In [1]:
library(repr)
library(tidyverse)
library(tidymodels)
library(themis)
options(scipen = 999, repr.matrix.max.rows = 6)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.6     [32m✔[39m [34mrsample     [39

In [2]:
# Loading the files and tidying them
og_players <- read_csv("https://raw.githubusercontent.com/achen45-png/toy_ds_project/refs/heads/main/data/players.csv",
                       show_col_types = FALSE)
players <- og_players |>
    mutate(
        experience = as_factor(experience), # Correcting the data types
        subscribe = as_factor(subscribe), # MUST BE CONVERTED TO FACTOR FOR step_dummy() TO WORK !!!!
        gender = as_factor(gender),
        Age = as.integer(Age)
    ) |>
    na.omit()
players

og_sessions <- read_csv("https://raw.githubusercontent.com/achen45-png/toy_ds_project/refs/heads/main/data/sessions.csv",
                        show_col_types = FALSE)
sessions <- og_sessions |>
    mutate(
        start_time = as.POSIXct(start_time, format = "%d/%m/%Y %H:%M", tz = "GMT"), # Optimizing the data types
        end_time = as.POSIXct(end_time, format = "%d/%m/%Y %H:%M", tz = "GMT")
    )
sessions

experience,subscribe,hashedEmail,played_hours,name,gender,Age
<fct>,<fct>,<chr>,<dbl>,<chr>,<fct>,<int>
Pro,TRUE,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,Male,9
Veteran,TRUE,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa939732842f2312358a88e9,3.8,Christian,Male,17
Veteran,FALSE,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3c5a9d2118eb7ccbb28,0.0,Blake,Male,17
⋮,⋮,⋮,⋮,⋮,⋮,⋮
Veteran,FALSE,71453e425f07d10da4fa2b349c83e73ccdf0fb3312f778b35c5802c3292c87bd,0.3,Pascal,Male,22
Amateur,FALSE,d572f391d452b76ea2d7e5e53a3d38bfd7499c7399db299bd4fedb06a46ad5bb,0.0,Dylan,Prefer not to say,57
Amateur,FALSE,f19e136ddde68f365afc860c725ccff54307dedd13968e896a9f890c40aea436,2.3,Harlow,Male,17


hashedEmail,start_time,end_time,original_start_time,original_end_time
<chr>,<dttm>,<dttm>,<dbl>,<dbl>
bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431d8aa0c4bf95ccee6bf,2024-06-30 18:12:00,2024-06-30 18:24:00,1719770000000,1719770000000
36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f575d4acc9cf487c4686,2024-06-17 23:33:00,2024-06-17 23:46:00,1718670000000,1718670000000
f8f5477f5a2e53616ae37421b1c660b971192bd8ff77e3398304c7ae42581fdc,2024-07-25 17:34:00,2024-07-25 17:57:00,1721930000000,1721930000000
⋮,⋮,⋮,⋮,⋮
fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33cbb5e894a3867ca44d,2024-07-28 15:36:00,2024-07-28 15:57:00,1722180000000,1722180000000
fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33cbb5e894a3867ca44d,2024-07-25 06:15:00,2024-07-25 06:22:00,1721890000000,1721890000000
36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f575d4acc9cf487c4686,2024-05-20 02:26:00,2024-05-20 02:45:00,1716170000000,1716170000000


In [11]:
# I decided to use best subset selection over forward selection when choosing the subset of predictor variables to include because
# we don't have that many variables anyway, so there's no computational cost for using best subset selection. Forward selection isn't
# as comprehensive and can run into the problem of having a candidate model that is really lucky and falsely accurate.

# First splitting the data
players_split <- initial_split(players, prop = 0.8, strata = played_hours) # 80/20 split for training/testing
players_training <- training(players_split)
players_testing <- testing(players_split)

# Tuning knn model that will be used for every candidate model
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
    set_engine("kknn") |>
    set_mode("regression")

# Recipes for each combination of predictor variables. We want to predict played_hours
# 1 predictor
# Recipe 1: experience
recipe_1 <- recipe(played_hours ~ experience, data = players_training) |>
    step_dummy(all_predictors()) |>
    step_zv(all_predictors()) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())
# Recipe 2: subscribe
recipe_2 <- recipe(played_hours ~ subscribe, data = players_training) |>
    step_dummy(all_predictors()) |>
    step_zv(all_predictors()) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())
# Recipe 3: gender
recipe_3 <- recipe(played_hours ~ gender, data = players_training) |>
    step_dummy(all_predictors()) |>
    step_zv(all_predictors()) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())
# Recipe 4: Age
recipe_4 <- recipe(played_hours ~ Age, data = players_training) |>
    #step_dummy(all_predictors(), -Age) |>
    #step_zv(all_predictors()) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())

# 2 predictors
# Recipe 5: experience + subscribe
recipe_5 <- recipe(played_hours ~ experience + subscribe, data = players_training) |>
    step_dummy(all_predictors()) |>
    step_zv(all_predictors()) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())
# Recipe 6: experience + gender
recipe_6 <- recipe(played_hours ~ experience + gender, data = players_training) |>
    step_dummy(all_predictors()) |>
    step_zv(all_predictors()) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())
# Recipe 7: experience + Age
recipe_7 <- recipe(played_hours ~ experience + Age, data = players_training) |>
    step_dummy(all_predictors(), -Age) |>
    step_zv(all_predictors()) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())
# Recipe 8: subscribe + gender
recipe_8 <- recipe(played_hours ~ subscribe + gender, data = players_training) |>
    step_dummy(all_predictors()) |>
    step_zv(all_predictors()) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())
# Recipe 9: subscribe + Age
recipe_9 <- recipe(played_hours ~ subscribe + Age, data = players_training) |>
    step_dummy(all_predictors(), -Age) |>
    step_zv(all_predictors()) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())
# Recipe 10: gender + Age
recipe_10 <- recipe(played_hours ~ gender + Age, data = players_training) |>
    step_dummy(all_predictors(), -Age) |>
    step_zv(all_predictors()) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())

# 3 predictors
# Recipe 11: experience + subscribe + gender
recipe_11 <- recipe(played_hours ~ experience + subscribe + gender, data = players_training) |>
     step_dummy(all_predictors()) |>
    step_zv(all_predictors()) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())
# Recipe 12: experience + subscribe + Age
recipe_12 <- recipe(played_hours ~ experience + subscribe + Age, data = players_training) |>
    step_dummy(all_predictors(), -Age) |>
    step_zv(all_predictors()) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())
# Recipe 13: experience + gender + Age
recipe_13 <- recipe(played_hours ~ experience + gender + Age, data = players_training) |>
    step_dummy(all_predictors(), -Age) |>
    step_zv(all_predictors()) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())
# Recipe 14: subscribe + gender + Age
recipe_14 <- recipe(played_hours ~ subscribe + gender + Age, data = players_training) |>
    step_dummy(all_predictors(), -Age) |>
    step_zv(all_predictors()) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())

# 4 predictors
# Recipe 15: ALL 4 variables, experience + subscribe + gender + Age
recipe_15 <- recipe(played_hours ~ experience + subscribe + gender + Age, data = players_training) |>
    step_dummy(all_predictors(), -Age) |>
    step_zv(all_predictors()) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())
# Probably should've removed unneeded columns like hashedEmail and name beforehand so I can just write played_hours ~ .
# Will fix in the final version

# Tried to make a recipe_printer, shit was impossible so we're stuck with this tedious code :(

# 5-fold cross-validation (would do 10-fold but I have 15 models to run so no way)
vfold <- vfold_cv(players_training, v = 5, strata = played_hours)
kvals <- tibble(neighbors = seq(from = 1, to = 10, by = 1))

workflow_printer <- function(recipe, model) {
    workflow() |>
    add_recipe(recipe) |>
    add_model(model) |>
    tune_grid(resamples = vfold, grid = kvals) |>
    collect_metrics() |>
    filter(.metric == "rmse") |>
    slice_min(mean, n = 1)
}

In [23]:
#workflow_1 <- workflow_printer(recipe_1, knn_spec)
#workflow_2 <- workflow_printer(recipe_2, knn_spec) # RECIPES END UP RETURNING THE SAME RESPONSE VALUE SO AN ERROR OCCURS
#workflow_3 <- workflow_printer(recipe_3, knn_spec)
#workflow_4 <- workflow_printer(recipe_4, knn_spec)
#workflow_5 <- workflow_printer(recipe_5, knn_spec)
#workflow_6 <- workflow_printer(recipe_6, knn_spec)
workflow_7 <- workflow_printer(recipe_7, knn_spec)
#workflow_8 <- workflow_printer(recipe_8, knn_spec)
workflow_9 <- workflow_printer(recipe_9, knn_spec)
workflow_10 <- workflow_printer(recipe_10, knn_spec)
workflow_11 <- workflow_printer(recipe_11, knn_spec)
workflow_12 <- workflow_printer(recipe_12, knn_spec)
workflow_13 <- workflow_printer(recipe_13, knn_spec)
workflow_14 <- workflow_printer(recipe_14, knn_spec)
workflow_15 <- workflow_printer(recipe_15, knn_spec)

best_subset <- rbind(workflow_7, workflow_9, workflow_10, workflow_11,
                     workflow_12, workflow_13, workflow_14, workflow_15) |>
    mutate(workflow_n = c(7, 9, 10:15)) |>
    arrange(desc(mean))
best_subset

#rbind(workflow_14,workflow_15) |> mutate(workflow_num = c(14:15))

neighbors,.metric,.estimator,mean,n,std_err,.config,workflow_n
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>,<dbl>
9,rmse,standard,26.90348,5,7.224993,Preprocessor1_Model09,12
9,rmse,standard,26.61843,5,8.758165,Preprocessor1_Model09,7
10,rmse,standard,26.37909,5,8.857046,Preprocessor1_Model10,15
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
10,rmse,standard,25.93564,5,9.020197,Preprocessor1_Model10,13
10,rmse,standard,25.20360,5,9.716570,Preprocessor1_Model10,11
1,rmse,standard,23.19682,5,10.620813,Preprocessor1_Model01,10


 # STILL NEED TO FIGURE OUT IF WE NEED TO UPSCALE THE RARE CATEGORIES OF EACH CATEGORICAL PREDICTOR !!!! (so the best subset with smallest rmse and whatnot is bound to change)

In [24]:
players_summary1 <- players |>
    group_by(experience) |>
    summarize(n = n())
players_summary2 <- players |>
    group_by(subscribe) |>
    summarize(n = n())
players_summary3 <- players |>
    group_by(gender) |>
    summarize(n = n())
#players_summary4 <- players |> # BETTER USE A HISTOGRAM TO SEE AGE DISTRIBUTION
    #group_by(Age) |>
    #summarize(n = n())

players_summary1
players_summary2
players_summary3
#players_summary4

# LIKE THERE'S CLEARLY AN UNEQUAL NUMBER OF INDIVIDUALS FOR EACH CATEGORY

experience,n
<fct>,<int>
Pro,13
Veteran,48
Amateur,63
Regular,35
Beginner,35


subscribe,n
<fct>,<int>
False,52
True,142


gender,n
<fct>,<int>
Male,124
Female,37
Non-binary,15
Prefer not to say,11
Agender,2
Two-Spirited,5


In [25]:
# SOME TROUBLESHOOTING: YOU CAN IGNORE THIS CELL


#recipe_3 <- recipe(played_hours ~ gender, data = players_training) |>
#    step_upsample(gender, over_ratio = 1, skip = FALSE) |>
#    step_dummy(all_predictors()) |>
#    step_zv(all_predictors()) |>
#    step_scale(all_predictors()) |>
#    step_center(all_predictors())

#vfold <- vfold_cv(players_training, v = 5, strata = played_hours)
#kvals <- tibble(neighbors = seq(from = 1, to = 10, by = 1))

#workflow_3 <- workflow() |>
#    add_recipe(recipe_3) |>
#    add_model(knn_spec) |>
#    tune_grid(resamples = vfold, grid = kvals)

#workflow_3