Start of project

In [None]:
# Packages used by this notebook: tidyverse (data wrangling and plotting),
# repr (notebook display options) and tidymodels (modelling workflow).
library(tidyverse)
library(repr)
library(tidymodels)
# Limit how many rows are shown when a table is displayed in the notebook.
options(repr.matrix.max.rows = 6)
# Run the project-local cleanup.R script. Its contents are not visible here;
# NOTE(review): check that file for any helpers or side effects it introduces.
source("cleanup.R")

In [None]:
# Load the players table from GitHub and prepare it for modelling:
#   1. drop the hashedEmail, name and experience columns,
#   2. keep only players with more than zero played hours,
#   3. convert subscribe to a factor so it can serve as a class label,
#   4. discard any row that still contains a missing value.
player_data <- read_csv(
  "https://raw.githubusercontent.com/Cna-51/minecraft_indiv/refs/heads/main/players%20(1).csv"
) |>
  select(!c(hashedEmail, name, experience)) |>
  filter(played_hours > 0) |>
  mutate(subscribe = as.factor(subscribe)) |>
  drop_na()

# Display the cleaned table.
player_data

In [None]:
# Scatter plot of played hours against age, with each point coloured by the
# player's subscription status.
player_plot <- ggplot(
  player_data,
  aes(x = Age, y = played_hours, colour = subscribe)
) +
  geom_point() +
  labs(
    x = "Player's Age (yrs)",
    y = "Player hours (hrs)",
    colour = "Subscribed",
    title = "Player's Age vs Played Hours"
  )

# Render the plot.
player_plot

In [None]:
# Split the cleaned data into a training set (70%) and a testing set (30%),
# stratified on subscribe so both sets keep a similar class balance.
#
# `prop` is the fraction of rows assigned to training. It is written as the
# literal 0.7: an expression such as `0.7-0.3` is evaluated arithmetically
# (giving 0.4) and would put only 40% of the rows in the training set.
#
# initial_split() samples rows at random, so the seed is fixed first to make
# the split (and every metric computed from it) reproducible across runs.
set.seed(1234)
player_split <- initial_split(player_data, prop = 0.7, strata = subscribe)
player_training <- training(player_split)
player_testing <- testing(player_split)

# Display both partitions.
player_training
player_testing

In [None]:
# Baseline model: a K-nearest-neighbours classifier with k = 3, trained on the
# training set and scored on the held-out testing set.
set.seed(1234)

# Preprocessing: predict subscribe from played_hours and Age, with both
# predictors scaled and then centred so neither dominates the distance.
player_recipe <- recipe(subscribe ~ played_hours + Age, data = player_training) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# Model: 3 neighbours, each with an equal ("rectangular") vote.
player_spec <- nearest_neighbor(neighbors = 3, weight_func = "rectangular") |>
  set_engine("kknn") |>
  set_mode("classification")

# Bundle recipe and model into one workflow and fit it on the training data.
player_fit <- workflow() |>
  add_model(player_spec) |>
  add_recipe(player_recipe) |>
  fit(data = player_training)

# Predict the class of every testing row and attach the predictions
# (.pred_class) in front of the original testing columns.
player_predictions <- bind_cols(
  predict(player_fit, player_testing),
  player_testing
)

# Compare predicted and true labels on the testing set.
prediction_accuracy <- metrics(
  player_predictions,
  truth = subscribe,
  estimate = .pred_class
)
prediction_accuracy

In [None]:
# Next, 5-fold cross-validation is run on the training set to see how the
# model performs across several train/validation splits, and k is tuned over
# 1..20 to find the value with the highest cross-validated accuracy.

# Five folds, stratified on subscribe so each fold keeps the class balance.
vfolds <- vfold_cv(player_training, v = 5, strata = subscribe)

# Cross-validated metrics for the fixed k = 3 specification (player_spec).
player_resample <- workflow() |>
    add_recipe(player_recipe) |>
    add_model(player_spec) |>
    fit_resamples(resamples = vfolds)
resample_metrics <- collect_metrics(player_resample)

# Same model, but with the number of neighbours left as a tuning parameter.
player_tune <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
    set_engine("kknn") |>
    set_mode("classification")

# Candidate values of k to evaluate.
k_vals <- tibble(neighbors = seq(from = 1, to = 20, by = 1))

# Fit every candidate k on every fold and collect the averaged metrics.
tuned_results <- workflow() |>
    add_recipe(player_recipe) |>
    add_model(player_tune) |>
    tune_grid(resamples = vfolds, grid = k_vals) |>
    collect_metrics()

# Keep the accuracy rows, sort from highest to lowest mean accuracy, and take
# the top row: the most accurate k. arrange() is a stable sort, so when several
# k tie for the best accuracy the smallest of them is kept.
accuracies <- tuned_results |>
    filter(.metric == "accuracy") |>
    arrange(desc(mean)) |>
    slice(1)
accuracies