In [2]:
library(tidyverse)
library(tidymodels)
library(repr)
library(cowplot)
options(repr.matrix.max.rows = 6)
source('cleanup.R')

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.6     [32m✔[39m [34mrsample     [39

ERROR: Error in file(filename, "r", encoding = encoding): cannot open the connection


**Loading players dataset**

In [None]:
players_data <- read_csv("https://raw.githubusercontent.com/ansonansonnn/project-data/refs/heads/main/players.csv")
players_data

**Players Dataset but only the variables we need: Experience, Age, Subscription Status**

In [None]:
players <- players_data |>
    select(experience, Age, subscribe) |>
    drop_na()
players

**Visual Exploratory Data Analysis**

In [None]:
#1) bar plot by experience

experience_plot <- players_data |>
    ggplot(aes(x = experience, fill = experience))+
    geom_bar(stat = "count")+
    labs(x = "Experience Category",
         y = "Number of Players",
         fill = "Experience Level",
         title = "Number of Players per Experience Level")
experience_plot

#2 distribution (histogram) by age

age_plot <- players |>
    ggplot(aes(x = Age))+
    geom_histogram(binwidth = 10)+
    labs(x = "Age of Players",
         y = "Number of Players",
         title = "Distribution of Ages of Players")
age_plot

#3 bar plot of subsribtion status of players

subscribed_plot <- players |>
    ggplot(aes(x = subscribe))+
    geom_bar()+
    labs(x = "Subscription Status of Players",
         y = "Number of Players",
         title = "Number of Players Subscribed and not Subscribed")
subscribed_plot

#4 bar plot of subscription status and experience proportion

subscribe_experience_plot <- players |>
    ggplot(aes(x = experience, fill = subscribe)) +
    geom_bar() +
    labs(x = "Proportion of Players Subscribed and Not Subscribed per Experience Level",
         y = "Number of Players",
         fill = "Subscribed")
subscribe_experience_plot

#5 proportional bar plot of subscription status and experience

players |>
  mutate(experience = as.factor(experience)) |>
  ggplot(aes(x = experience, fill = subscribe)) +
  geom_bar(position = "fill") +
  labs(
    title = "Subscription Rate by Experience Level",
    y = "Proportion"
  )

**Exploratory Data Analysis**

In [3]:
players_summarised <- players |>
    summarise( 
        mean_player_age = mean(Age),
        median_player_age = median(Age),
        min_age = min(Age),
        max_age = max(Age))
players_summarised

exp_count <- players |>
    count(experience)
exp_count

sub_count <- players |>
    count(subscribe)
sub_count

ERROR: Error in eval(expr, envir, enclos): object 'players' not found


In [4]:
players_data <- read_csv("https://raw.githubusercontent.com/ansonansonnn/dsci100-project/refs/heads/main/players.csv")
players_data

players_clean <- players_data|> drop_na()
players_selected <- players_clean|> select(Age, experience, subscribe)
players_selected

set.seed(123)

players_class <- players_clean |>
mutate(subscribe= factor(subscribe, levels= c(FALSE, TRUE)),
       experience_num= as.numeric((factor(experience, 
                                       levels = c("Beginner", "Amateur", "Regular", "Pro", "Veteran"))))
) |> select(subscribe, Age, experience_num)
    

players_split <- initial_split(players_class, prop= 0.8)
players_train <- training(players_split)
players_test <- testing(players_split)

players_recipe <- recipe(subscribe ~ ., data= players_train) |>
step_center(all_predictors()) |>
step_scale(all_predictors())

knn_model <- nearest_neighbor(mode= "classification", neighbors= tune())|>
set_engine("kknn")

knn_workflow <- workflow() |>
add_model(knn_model) |>
add_recipe(players_recipe)

set.seed(123)

folds <- vfold_cv(players_train, v= 5)

knn_grid <- grid_regular(neighbors(range= c(1,30)), levels= 30)

knn_tune_results <- tune_grid(knn_workflow, resamples= folds, grid= knn_grid, metrics= metric_set(accuracy))

best_knn <- knn_tune_results |> select_best("accuracy")
best_knn

final_knn_workflow <- knn_workflow |> finalize_workflow(best_knn)

final_knn_fit <- final_knn_workflow |> fit(data= players_train)

test_results <- final_knn_fit |> predict(players_test) |>
bind_cols(players_test) |>
metrics(truth= subscribe, estimate= .pred_class)
test_results

conf_matrix <- final_knn_fit |>
predict(players_test) |>
bind_cols(players_test) |>
conf_mat(truth= subscribe, estimate= .pred_class)
conf_matrix

accuracy(final_knn_fit |>
         predict(players_test) |>
         bind_cols(players_test),
         truth= subscribe,
         estimate= .pred_class)

knn_class <- predict(final_knn_fit, players_test) |>
bind_cols(players_test)

ggplot(knn_prob, aes(x= Age, y= experience_num, color= .pred_class))+
geom_point(size= 2, alpha= 0.6)+
labs(x= "Age (in years)",
     y= "Experience Level of Player",
     color= "Subscription Status Prediction",
     title= "Figure 1: KNN Predictions of Subscription Status Based on Age and Experience")

knn_prob <- predict(final_knn_fit, players_test, type= "prob") |>
bind_cols(players_test)

ggplot(knn_prob, aes(x= Age, y= experience_num, color= .pred_TRUE))+
geom_point(size= 2, alpha= 0.6)+
scale_color_gradient(low= "blue", high= "red") +
labs(x= "Age (in years)",
     y= "Experience Level of Player",
     color= "Probability of Player Subscribing",
     title= "Figure 2: KNN Prediction of Probability of Player Subscribing Based on Age and Experience")

knn_prob <- predict(final_knn_fit, players_test, type= "prob") |>
bind_cols(players_test) |>
bind_cols(predict(final_knn_fit, players_test, type= "class"))

ggplot(knn_prob, aes(x = .pred_TRUE, fill = .pred_class)) +
  geom_histogram(binwidth = 0.1, alpha = 0.7, position = "dodge") +
  labs(x = "Predicted Probability of Subscription",
       y = "Count",
       fill = "Predicted Class",
       title = "Figure 3: Histogram of KNN Predicted Subscription Probabilities")



[1mRows: [22m[34m196[39m [1mColumns: [22m[34m7[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (4): experience, hashedEmail, name, gender
[32mdbl[39m (2): played_hours, Age
[33mlgl[39m (1): subscribe

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


experience,subscribe,hashedEmail,played_hours,name,gender,Age
<chr>,<lgl>,<chr>,<dbl>,<chr>,<chr>,<dbl>
Pro,TRUE,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,Male,9
Veteran,TRUE,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa939732842f2312358a88e9,3.8,Christian,Male,17
Veteran,FALSE,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3c5a9d2118eb7ccbb28,0.0,Blake,Male,17
⋮,⋮,⋮,⋮,⋮,⋮,⋮
Amateur,FALSE,d572f391d452b76ea2d7e5e53a3d38bfd7499c7399db299bd4fedb06a46ad5bb,0.0,Dylan,Prefer not to say,57
Amateur,FALSE,f19e136ddde68f365afc860c725ccff54307dedd13968e896a9f890c40aea436,2.3,Harlow,Male,17
Pro,TRUE,d9473710057f7d42f36570f0be83817a4eea614029ff90cf50d8889cdd729d11,0.2,Ahmed,Other,


Age,experience,subscribe
<dbl>,<chr>,<lgl>
9,Pro,TRUE
17,Veteran,TRUE
17,Veteran,FALSE
⋮,⋮,⋮
22,Veteran,FALSE
57,Amateur,FALSE
17,Amateur,FALSE


neighbors,.config
<int>,<chr>
17,Preprocessor1_Model17


.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
accuracy,binary,0.8205128
kap,binary,0.2834646


          Truth
Prediction FALSE TRUE
     FALSE     2    1
     TRUE      6   30

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
accuracy,binary,0.8205128


ERROR: Error in eval(expr, envir, enclos): object 'knn_prob' not found
