In [None]:
library(tidyverse)
library(repr)
library(readxl)
library(tidymodels)
source("cleanup.R")
options(repr.matrix.max.rows = 6)

## Title

## Methods and Results

### Load, wrangle, and split data

In [None]:
# Import the player records and keep only the columns this analysis uses:
# the outcome (subscribe) and the two predictors (played_hours, Age).
player_data <- read_csv("players.csv") |>
  select(subscribe, played_hours, Age) |>
  # Discard any row where either predictor is missing.
  filter(!(is.na(Age) | is.na(played_hours))) |>
  # Store the outcome as a factor so it can be used as a class label.
  mutate(subscribe = as.factor(subscribe))

# Fix the RNG state immediately before the split so the partition is
# reproducible from run to run.
set.seed(5)

# 75% training / 25% testing, stratified on the outcome.
player_split <- initial_split(player_data, prop = 0.75, strata = subscribe)
player_train <- training(player_split)
player_test <- testing(player_split)

### Summary of Data Set

In [None]:
# Size of the cleaned data set: row count, then column count.
player_data |> nrow()
player_data |> ncol()

# Per-column summary statistics (counts for the factor, quantiles and mean
# for the numeric columns).
player_data |> summary()

**Summary Table of Variables from Player_data** <br>
Below is a summary of the relevant variables and their descriptions for the player data set: <br>

|Variable Name|Data Type|Description/Meaning|Summary Statistics/Values|
|:-------------:|:---------:|:-------------------:|:-------------------------:|
|subscribe| factor (converted from logical) | If the player is subscribed to the magazine or not| True = 142, False = 52|
|played_hours| numeric | Total hours played by each player | Mean = 5.95, Median = 0.10, Min = 0.00, Max = 223.10| 
|Age| numeric | Player's age in years | Mean = 21.14, Median = 19.00, Min = 9.00, Max = 58.00|

Number of rows: 194 <br>
Number of columns: 3

### Visualizations for Player Data Set - Exploratory Data Analysis 

In [None]:
library(RColorBrewer)

Visualization 1: Distribution of Age

In [None]:
# Enlarge the rendered figure so the axis text is readable in the notebook.
options(repr.plot.width = 12, repr.plot.height = 8)

# Histogram of player ages in 5-year bins.
dist_age_player <- player_data |>
    ggplot(aes(x = Age)) +
    # The bar colour is a constant, not a mapped aesthetic, so it is set
    # directly on the geom. The previous `scale_color_manual(val"steelblue")`
    # was a parse error, and a colour scale would have had no effect anyway
    # because no colour aesthetic is mapped in this plot.
    geom_histogram(binwidth = 5, fill = "steelblue") +
    labs(x = "Age of Player (Years)",
         y = "Amount of Players") +
    ggtitle("Distribution of Players Ages") +
    theme(text = element_text(size = 20))

dist_age_player

### Use V-fold cross-validation to choose K

In [None]:
# K-nearest-neighbours classifier with the number of neighbours left as a
# tunable parameter; "rectangular" gives every neighbour an equal vote.
knn_tune <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("classification")

# Preprocessing: scale, then centre, both predictors, with the statistics
# estimated from the training set only.
player_recipe <- recipe(subscribe ~ played_hours + Age, data = player_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# Candidate values of K to evaluate: 1, 2, ..., 100.
k_vals <- tibble(neighbors = seq(1, 100, by = 1))

# 5-fold cross-validation on the training set, stratified on the outcome.
player_vfold <- vfold_cv(player_train, v = 5, strata = subscribe)

# Fit the workflow for every candidate K on every fold and collect the
# metrics averaged over folds.
player_k_results <- workflow() |>
  add_recipe(player_recipe) |>
  add_model(knn_tune) |>
  tune_grid(resamples = player_vfold, grid = k_vals) |>
  collect_metrics()

In [None]:
# Keep only the accuracy rows of the cross-validation results.
player_k_accuracy <- filter(player_k_results, .metric == "accuracy")

# Mean cross-validated accuracy as a function of K.
player_k_accuracy_plot <- player_k_accuracy |>
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line()

# K with the highest mean accuracy (first row after sorting descending).
player_k_best <- player_k_accuracy |>
  arrange(desc(mean)) |>
  slice(1) |>
  pull(neighbors)

player_k_accuracy
player_k_accuracy_plot
player_k_best

### Building the model with the chosen K value

In [None]:
# Final K-NN specification. K comes from `player_k_best`, the value selected
# by cross-validation in the tuning cell, rather than a hard-coded constant,
# so the model stays in sync if the data, seed, or grid of candidates changes.
player_spec <- nearest_neighbor(weight_func = "rectangular",
                                neighbors = player_k_best) |>
            set_engine("kknn") |>
            set_mode("classification")

# Fit the preprocessing recipe and the model on the full training set.
player_fit <- workflow() |>
                    add_recipe(player_recipe) |>
                    add_model(player_spec) |>
                    fit(data = player_train)

# Predict on the held-out test set and attach the predictions to the
# original test rows so truth and estimate sit side by side.
player_prediction <- predict(player_fit, player_test) |>
                    bind_cols(player_test)

# Classification metrics comparing predicted class with the true label.
player_prediction_accuracy <- player_prediction |>
                            metrics(truth = subscribe, estimate = .pred_class)
player_prediction_accuracy

In [None]:
# Size of the cleaned data set: row count, then column count.
player_data |> nrow()
player_data |> ncol()

# Per-column summary statistics (counts for the factor, quantiles and mean
# for the numeric columns).
player_data |> summary()

**Summary Table of Variables from Player_data**

|Variable Name|Data Type|Description/Meaning|Summary Statistics/Values|
|:-------------:|:---------:|:-------------------:|:-------------------------:|
|subscribe| factor (converted from logical) | If the player is subscribed to the magazine or not| True = 142, False = 52|
|played_hours| numeric | Total hours played by each player | Mean = 5.95, Median = 0.10, Min = 0.00, Max = 223.10| 
|Age| numeric | Player's age in years | Mean = 21.14, Median = 19.00, Min = 9.00, Max = 58.00|