# DSCI Group 20 Project Final Report

Group members: 

- Hannah Yin 
- 

In [None]:
# Packages used throughout the notebook: tidyverse for data wrangling and
# plotting, tidymodels for the modelling workflow, repr for notebook display.
library(tidyverse)
library(tidymodels)
library(repr)
# Cap the number of rows shown when a table is displayed in the notebook.
options(repr.matrix.max.rows = 6)

## Introduction

## Methods & Results

In [None]:
# Read the players data set straight from the project's GitHub repository,
# then display it.
players <-
  "https://raw.githubusercontent.com/ahnnahy/dsci100-group20-project/refs/heads/main/players%20(1).csv" |>
  read_csv()

players

In [None]:
# Fix the random seed so that the random steps later in the notebook (the
# train/test split and the cross-validation folds) are reproducible.
set.seed(100)

# Add `subscribed`, a factor copy of `subscribe`, to serve as the class label
# for plotting and classification.
players <- mutate(players, subscribed = as_factor(subscribe))

players

In [None]:
# Size of the rendered figure in the notebook output.
options(repr.plot.width = 10, repr.plot.height = 10)

# Scatter plot of total played hours against age, with points coloured by
# subscription status.
players_plot <- players |>
  ggplot(aes(x = Age, y = played_hours, colour = subscribed)) +
  geom_point() +
  labs(
    x = "Age (years)",
    y = "Total played hours",
    color = "Subscribed?",
    title = "Subscription across total played hours vs. age"
  ) +
  theme(text = element_text(size = 20))

players_plot

In [None]:
# Re-seed at the top of this cell: the train/test split and the fold
# assignment below are both random, so seeding here keeps them reproducible
# even when this cell is re-run on its own. The value matches the earlier
# set.seed() call in the notebook.
set.seed(100)

# Hold out 25% of the players as a test set, keeping the proportion of
# subscribed / not subscribed players similar in both parts.
players_split <- initial_split(players, prop = 0.75, strata = subscribed)
players_train <- training(players_split)
players_test <- testing(players_split)

# 5-fold cross-validation on the training data only, stratified on the label.
players_vfold <- vfold_cv(players_train, v = 5, strata = subscribed)

# Predict subscription from age and played hours. Both predictors are scaled
# and centred so that neither dominates the distance calculation.
players_recipe <- recipe(subscribed ~ Age + played_hours, data = players_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# K-nearest-neighbours classifier with unweighted votes; the number of
# neighbours is left as a tuning parameter.
players_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("classification")

# Candidate values of k: every integer from 1 to 100.
k_vals <- tibble(neighbors = seq(from = 1, to = 100, by = 1))

# Fit the workflow for every candidate k on every fold and collect the
# cross-validated metrics.
player_results <- workflow() |>
  add_recipe(players_recipe) |>
  add_model(players_spec) |>
  tune_grid(resamples = players_vfold, grid = k_vals) |>
  collect_metrics()

# Keep only the accuracy rows (one per candidate k).
accuracies <- player_results |>
  filter(.metric == "accuracy")

accuracies


In [None]:
# Row(s) of the tuning results with the highest mean cross-validated accuracy.
best_k <- slice_max(accuracies, mean, n = 1)

# Mean cross-validated accuracy for each candidate number of neighbours.
accuracy_vs_k <- accuracies |>
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbors", y = "Accuracy Estimate") +
  theme(text = element_text(size = 20))

accuracy_vs_k
best_k

We can see above that the best k value to use here is k = 19. We will now retrain the model on the full training set with k = 19 and evaluate it on the testing set.

In [None]:
# Final K-nearest-neighbours specification. The value 19 is hard-coded from the
# tuning step; update it if the seed, the data, or the tuning grid changes.
players_best_spec <-
  nearest_neighbor(weight_func = "rectangular", neighbors = 19) |>
  set_engine("kknn") |>
  set_mode("classification")

# Refit the preprocessing recipe and the final model on the whole training set.
player_best_results <- workflow() |>
  add_recipe(players_recipe) |>
  add_model(players_best_spec) |>
  fit(data = players_train)

# Predict the held-out test set and attach the predictions to the test rows.
player_test_predictions <- bind_cols(
  predict(player_best_results, players_test),
  players_test
)

# Accuracy of the final model on the test set.
player_test_predictions |>
  metrics(truth = subscribed, estimate = .pred_class) |>
  filter(.metric == "accuracy")

We have found that the model correctly predicts whether a player is subscribed about 75.51% of the time on the testing set. For a dataset like this, that is a reasonable accuracy, but it would be too low for a dataset on health, for example, where wrong predictions could have large impacts.

## Discussion

## References