# Introduction

In [None]:
library(tidyverse)
library(tidymodels)
library(lubridate)
library(dbplyr)
library(ggplot2)

# Load the player data set from disk and preview its first rows.
players <- read_csv(file = "players.csv")
players |>
  head()

### Summary of Variable names
|Number of Variable | Variable Name| Type of Variable | Meaning of The Variable |
| ------------------| ------------ | ---------------- | ----------------------- |
|1                  | hashedEmail  | Character (chr)  | A unique identifier for each player|
|2                  | experience   | Character (chr)  | Player's level of experience |
|3                  | subscribe    | Logical (lgl)	  | TRUE/FALSE indicating whether the player is subscribed to a game-related newsletter	|
|4                  | played_hours | Numeric (dbl)    | Total number of hours the player has spent playing the game for the current session |
|5                  | name         | Character (chr)  | The name of the player|
|6                 | gender       | Character (chr)  | The gender of the player	|
|7                 | Age          | Numeric (dbl)    | The age of the player	|


### Issues
| Issue | Description of Issue | Solution |
| ------| -------------------- | -------- |
| Duplicate Entries | Some participants have multiple records | Ensure these are distinct play sessions and not duplicates. Group data by hashedEmail to verify if multiple sessions correspond to the same user or if it's a data entry error	|
| Missing Data | Missing values in key columns like start_time, end_time, experience, subscribe, played_hours | Either drop rows with missing values using drop_na(), or fill in missing values with mean imputation using step_impute_mean(all_predictors()) |
| Time Format Consistency | The start_time and end_time columns use a string format | Convert these strings to a proper datetime format |
| Age Distribution | The age range is wide and not evenly distributed | Create age categories (e.g., 0-18, 19-35, 36-50, etc.) to simplify analysis|
| Gender Representation | Some entries for gender are non-binary or unknown | Handle these cases by either grouping non-binary responses into a category or excluding them |
| Multiple entries | The same user may appear under multiple hashed emails, which could indicate duplicates or incorrectly handled data during hashing | Verify that each hashedEmail corresponds to a unique user |
| Inconsistent Session Lengths | The played_hours column can have values that seem unusually short, suggesting that some players may have logged incomplete sessions | Set thresholds for what constitutes a valid session duration. Filter out sessions that fall below the threshold |
| Correlation Between Variables | Columns like experience and played_hours might have a strong relationship | Use correlation metrics or scatter plots to visualize relationships |

In [None]:
# Convert the `subscribe` flag to a factor with human-readable labels so it
# can be used directly as a plot legend.
players_recode <- mutate(
  players,
  subscribe = fct_recode(
    as_factor(subscribe),
    "Subscribed" = "TRUE",
    "Not subscribed" = "FALSE"
  )
)
players_recode

# One-row tibble holding the mean of played hours and age; missing values
# are ignored when averaging.
players_mean <- players |>
  summarise(
    across(c(played_hours, Age), \(column) mean(column, na.rm = TRUE))
  )



In [None]:
# Size of the rendered figure in the notebook output.
options(repr.plot.width = 12, repr.plot.height = 7)

# Scatter plot of age against hours played, coloured by subscription status.
players_plot <- ggplot(
  players_recode,
  aes(x = played_hours, y = Age, color = subscribe)
) +
  geom_point() +
  labs(
    x = "Time spent playing (hours)",
    y = "Age of player (years)",
    title = "Time played and Age (unstandardized)"
  ) +
  theme(text = element_text(size = 15))

players_plot

In [None]:
# Histogram of total playing time, using half-hour bins.
players_histogram <- ggplot(players_recode, aes(x = played_hours)) +
  geom_histogram(binwidth = 0.5) +
  labs(
    x = "Time spent playing (hours)",
    y = "Count",
    title = "Distribution of playing time"
  ) +
  theme(text = element_text(size = 15))

# Zoom into the 0-50 hour / 0-25 count window with coord_cartesian() instead
# of scale limits. Limits set on a scale change the underlying data: values
# outside the range are discarded and bars that extend past a limit are
# dropped with a warning, so bars can silently vanish from the figure.
# coord_cartesian() only changes the visible window; taller bars are simply
# clipped at the top of the panel.
players_histogram +
  coord_cartesian(xlim = c(0, 50), ylim = c(0, 25))

In [None]:
#Classification algorithm:

In [None]:
# Tune the number of neighbours (K) for a K-nearest-neighbours classifier
# that predicts newsletter subscription from age and hours played, using
# 5-fold cross-validation, then plot accuracy against K.

# Fix the RNG so the split and the cross-validation folds are reproducible.
set.seed(1234)

options(repr.plot.height = 5, repr.plot.width = 6)

# Keep only the modelling columns and drop rows with missing values, since
# the distance computation needs complete predictors. `players_recode` is
# used because its `subscribe` column is already a factor, as required for
# a classification outcome.
players_model <- players_recode |>
  select(subscribe, Age, played_hours) |>
  drop_na()

# NOTE(review): `training_set` was not defined anywhere earlier in the
# notebook, so it is created here. The 75/25 stratified split is an
# assumption -- adjust the proportion if a different split is intended.
players_split <- initial_split(players_model, prop = 0.75, strata = subscribe)
training_set <- training(players_split)

# The outcome column is `subscribe` (not `subscriber`). Predictors are
# standardised because KNN is distance-based and Age and played_hours are
# measured on different scales.
data_recipe <- recipe(subscribe ~ Age + played_hours, data = training_set) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("classification")

# Stratify the folds on the outcome so each fold keeps a similar class mix.
data_vfold <- vfold_cv(training_set, v = 5, strata = subscribe)

# Candidate values of K to evaluate.
kvalues_tibble <- tibble(neighbors = c(2, 3, 4, 5, 6))

knn_results <- workflow() |>
  add_recipe(data_recipe) |>
  add_model(knn_spec) |>
  tune_grid(resamples = data_vfold, grid = kvalues_tibble)

data_metrics <- collect_metrics(knn_results)

# Keep only the accuracy rows; `neighbors` and `mean` are already numeric,
# so no type conversion is needed.
accuracies <- data_metrics |>
  filter(.metric == "accuracy") |>
  drop_na()

# Cross-validated accuracy as a function of K.
cross_val_plot <- accuracies |>
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "K number of neighbors", y = "Accuracy of Model Estimate") +
  theme(text = element_text(size = 12))

cross_val_plot