In [1]:
library(tidyverse)
library(tidymodels)
library(repr)
library(cowplot)
options(repr.matrix.max.rows = 6)
source('cleanup.R')

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.6     [32m✔[39m [34mrsample     [39

ERROR: Error in file(filename, "r", encoding = encoding): cannot open the connection


In [2]:
players <- read_csv("https://raw.githubusercontent.com/ansonansonnn/project-data/refs/heads/main/players.csv")
players

[1mRows: [22m[34m196[39m [1mColumns: [22m[34m7[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (4): experience, hashedEmail, name, gender
[32mdbl[39m (2): played_hours, Age
[33mlgl[39m (1): subscribe

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


experience,subscribe,hashedEmail,played_hours,name,gender,Age
<chr>,<lgl>,<chr>,<dbl>,<chr>,<chr>,<dbl>
Pro,TRUE,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,Male,9
Veteran,TRUE,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa939732842f2312358a88e9,3.8,Christian,Male,17
Veteran,FALSE,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3c5a9d2118eb7ccbb28,0.0,Blake,Male,17
⋮,⋮,⋮,⋮,⋮,⋮,⋮
Amateur,FALSE,d572f391d452b76ea2d7e5e53a3d38bfd7499c7399db299bd4fedb06a46ad5bb,0.0,Dylan,Prefer not to say,57
Amateur,FALSE,f19e136ddde68f365afc860c725ccff54307dedd13968e896a9f890c40aea436,2.3,Harlow,Male,17
Pro,TRUE,d9473710057f7d42f36570f0be83817a4eea614029ff90cf50d8889cdd729d11,0.2,Ahmed,Other,


In [3]:
players_short <- players |>
    select(experience, Age, subscribe) |>
    drop_na()
players_short

experience,Age,subscribe
<chr>,<dbl>,<lgl>
Pro,9,TRUE
Veteran,17,TRUE
Veteran,17,FALSE
⋮,⋮,⋮
Veteran,22,FALSE
Amateur,57,FALSE
Amateur,17,FALSE


In [4]:
set.seed(1000)

# Optimal K

players_short <- players_short |>
    mutate(subscribe = as.factor(subscribe))

knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
    set_engine("kknn") |>
    set_mode("classification")

data_split <- initial_split(players_short, prop = 0.7, strata = subscribe) 
    train_data <- training(data_split)
    test_data  <- testing(data_split)

knn_recipe <- recipe(subscribe ~ Age, data = train_data) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())

knn_wf <- workflow() |>
    add_model(knn_spec) |>
    add_recipe(knn_recipe)

k_grid <- grid_regular(neighbors(range = c(1, 10)), levels = 10)

k_vfold <- vfold_cv(train_data, v = 5)

knn_tuned <- tune_grid(knn_wf, resamples = k_vfold, grid = k_grid, metrics = metric_set(accuracy))

k_select <- select_best(knn_tuned, metric = "accuracy")
k_select

neighbors,.config
<int>,<chr>
7,Preprocessor1_Model07


In [5]:
sub_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = k_select) |>
    set_engine("kknn") |>
    set_mode("classification")

sub_recipe <- recipe(subscribe ~ Age, data = players_short) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())

sub_workflow <- workflow() |>
    add_model(sub_spec) |>
    add_recipe(sub_recipe)

sub_workflow

══ Workflow ════════════════════════════════════════════════════════════════════
[3mPreprocessor:[23m Recipe
[3mModel:[23m nearest_neighbor()

── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps

• step_scale()
• step_center()

── Model ───────────────────────────────────────────────────────────────────────
K-Nearest Neighbor Model Specification (classification)

Main Arguments:
  neighbors = k_select
  weight_func = rectangular

Computational engine: kknn 
