In [None]:
# Data import, wrangling and plotting (read_csv, dplyr verbs, ggplot2).
library(tidyverse)
# Notebook display options for tables and plots (used via options() below).
library(repr)
# NOTE(review): no readxl function is called in the visible cells - confirm
# this package is still needed.
library(readxl)
# Modelling framework: data splitting, recipes, workflows, tuning, metrics.
library(tidymodels)
# Supplies step_upsample(), used to rebalance the classes in the recipe.
library(themis)
# NOTE(review): the contents of cleanup.R are not visible here - verify what
# it defines before relying on it.
source("cleanup.R")
# Show at most 6 rows when a data frame is printed in the notebook.
options(repr.matrix.max.rows = 6)

## Title

## Methods and Results

### Load, wrangle, and split data

In [None]:
# Read the raw player table, discard rows with a missing predictor, keep only
# the three columns used in this analysis, and make the response a factor so
# it can be used as a classification target.
player_data <- read_csv("players.csv") |>
  drop_na(Age, played_hours) |>
  select(subscribe, played_hours, Age) |>
  mutate(subscribe = as.factor(subscribe))

# Fix the RNG state so the stratified 75/25 train/test split is reproducible.
set.seed(1234)
player_split <- initial_split(player_data, prop = 0.75, strata = subscribe)
player_train <- training(player_split)
player_test <- testing(player_split)


### Summary of Data Set

In [None]:
# Size of the cleaned data set: number of observations, then number of
# variables.

player_data |> nrow()
player_data |> ncol()

In [None]:
# Per-column summary: class counts for the factor, five-number summary and
# mean for the numeric columns.
player_data |> summary()

Below is a summary of the relevant variables and their descriptions for the player data set:

|Variable Name|Data Type|Description/Meaning|Summary Statistics/Values|
|-------------|---------|-------------------|-------------------------|
|subscribe|factor (converted from logical)|Whether a player is subscribed or not|TRUE = 142, FALSE = 52|
|played_hours|numeric|Total hours played by each player|Mean = 5.95, Median = 0.1, Min = 0.00, Max = 223.10|
|Age|numeric|Age of each player|Mean = 21.14, Median = 19, Min = 9, Max = 58|

Number of rows: 194 <br>
Number of columns: 3

### Visualizations for Player Data Set - Exploratory Data Analysis

Visualization 1: Distribution of Age

In [None]:
# Use a wide plotting canvas for the figures rendered in this notebook.
options(repr.plot.width = 14, repr.plot.height = 8)

# Histogram of player ages in 5-year bins.
dist_age_player <- ggplot(player_data, aes(x = Age)) +
  geom_histogram(binwidth = 5) +
  labs(
    title = "Distribution of Players Ages",
    x = "Age of Player (Years)",
    y = "Amount of Players"
  )

dist_age_player

Visualization 2: Distribution of Played Hours

In [None]:
# Histogram of total hours played per player, in 10-hour bins.
dist_player_hours <- player_data |>
    ggplot(aes(x = played_hours)) +
    geom_histogram(binwidth = 10) +
    labs(x = "Hours Played by Each Player (Hours)",
         y = "Amount of Players") +
    ggtitle("Distribution of Players Played Hours")

dist_player_hours

In [None]:
# Visualization 3: Age vs Played Hours, coloured by subscription status.
# Scatter plot of the two predictors, used to eyeball whether subscribers and
# non-subscribers separate in this feature space.
played_hrs_vs_age <- player_data |>
    ggplot(aes(x = played_hours, y = Age, color = subscribe)) +
    geom_point() +
    labs(x = "Played Hours by Each Player (Hours)",
         y = "Age of Player (Years)",
         color = "Subscribed Or Not") +
    ggtitle("Players Age vs Hours Played vs Subscribed or Not")

# Print the plot object so the cell actually renders the figure.
played_hrs_vs_age

### Use V-fold cross-validation to choose K

In [None]:
# Model specification: unweighted ("rectangular") K-nearest neighbours, with
# the number of neighbours left open as a tuning parameter.
knn_tune <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = tune()
) |>
  set_engine("kknn") |>
  set_mode("classification")

# Preprocessing: scale then centre both predictors, then upsample the minority
# class to a 1:1 ratio. skip = TRUE keeps the upsampling out of prediction on
# new data.
player_recipe <- recipe(subscribe ~ played_hours + Age, data = player_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors()) |>
  step_upsample(subscribe, over_ratio = 1, skip = TRUE)

# 5-fold cross-validation on the training set, stratified by the response.
player_vfold <- vfold_cv(player_train, v = 5, strata = subscribe)

# Candidate values of K: every integer value from 1 to 120.
k_vals <- tibble(neighbors = seq(1, 120, by = 1))

# Fit the workflow for every candidate K on every fold and collect the
# resampled performance estimates.
player_k_results <- workflow() |>
  add_model(knn_tune) |>
  add_recipe(player_recipe) |>
  tune_grid(resamples = player_vfold, grid = k_vals) |>
  collect_metrics()

In [None]:
# Keep only the cross-validated accuracy estimates (one row per candidate K).
player_k_accuracy <- player_k_results |>
  filter(.metric == "accuracy")

# Pick a single best K: the highest mean accuracy, and if several K values tie
# for it, the smallest of them. Without the tie-break, slice_max() returns
# every tied row and player_k_best would be a vector rather than one number.
player_k_best <- player_k_accuracy |>
  slice_max(mean, n = 1, with_ties = TRUE) |>
  slice_min(neighbors, n = 1, with_ties = FALSE) |>
  pull(neighbors)

# Accuracy as a function of K, to check how stable the chosen value is.
player_k_accuracy_plot <- ggplot(player_k_accuracy, aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbors (K)", y = "Mean Cross-Validated Accuracy")

head(player_k_accuracy)
player_k_accuracy_plot
player_k_best

### Building Model with decided K value

In [None]:
# Use the K chosen by cross-validation instead of a hard-coded number, so the
# final model stays in sync with the tuning results if the data or the grid
# change. `[1]` guards against player_k_best holding several tied values.
best_k <- player_k_best[1]

# Final model specification with the selected number of neighbours.
player_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = best_k) |>
  set_engine("kknn") |>
  set_mode("classification")

# Train the preprocessing recipe and the model on the full training set.
player_fit <- workflow() |>
  add_recipe(player_recipe) |>
  add_model(player_spec) |>
  fit(data = player_train)

# Predict on the held-out test set and attach the predictions to the true
# labels and predictors.
player_prediction <- predict(player_fit, player_test) |>
  bind_cols(player_test)

# Test-set performance metrics for the predicted class.
player_prediction_accuracy <- player_prediction |>
  metrics(truth = subscribe, estimate = .pred_class)
player_prediction_accuracy