In [None]:
library(tidyverse)
library(dplyr)
library(RColorBrewer)

Clearly state one broad question that you will address, and the specific question that you have formulated. Your question should involve one response variable of interest and one or more explanatory variables, and should be stated as a question. One common question format is: “Can [explanatory variable(s)] predict [response variable] in [dataset]?”, but you are free to format your question as you choose so long as it is clear. Describe clearly how the data will help you address the question of interest. You may need to describe how you plan to wrangle your data to get it into a form where you can apply one of the predictive methods from this class.

The broad question I aim to address in this project is: "What player characteristics and behaviours are most predictive of subscribing to a game-related newsletter, and how do these features differ between various player types?" I will address it through a specific focus on the ages of players, asking the specific question: "Can the ages of Minecraft players be used to predict their subscription rates to gaming newsletters in the players dataset?" I believe this can be achieved by grouping players into subsections for each experience level, using age as the predictor of newsletter subscription in both a KNN model and a linear regression model, and then comparing the RMSPE of the two models to determine which one predicts better.

Provide a full descriptive summary of the dataset, including information such as the number of observations, summary statistics (report values to 2 decimal places), number of variables, name and type of variables, what the variables mean, any issues you see in the data, any other potential issues related to things you cannot directly see, how the data were collected, etc. Make sure to use bullet point lists or tables to summarize the variables in an easy-to-understand format.

Note that the selected dataset(s) will probably contain more variables than you need. In fact, exploring how the different variables in the dataset affect your model may be a crucial part of the project. You need to summarize the full data regardless of which variables you may choose to use later on.

#### Description of Dataset

##### 1. Players Dataset
- 196 rows (observations)
- 7 columns (variables)
    - experience
        - Describes the self-reported level of experience and comfort the player has in Minecraft
        - Divided up into 5 categories: Beginner, Amateur, Regular, Veteran, Pro
        - Character type variable
    - subscribe
        - Denotes whether or not the player is subscribed to a game-related newsletter
        - Logical type variable
    - hashedEmail
        - Denotes the hashed emails of players
        - Character type variable
    - played_hours
        - Denotes the individual play times of players on the experimental Minecraft server
        - Double type variable
    - name
        - Denotes the names of the players who participated in this study

In [None]:
# Mean hours played within each experience group; missing play times are
# ignored. The summary column is named directly in summarize().
players_time <- summarize(
    group_by(players_data, experience),
    average_played_hours = mean(played_hours, na.rm = TRUE)
)

# Number of subscribers per experience group.
# Assumes `subscribe` is logical, so that summing it counts the TRUE values.
subscribers_by_experience <- group_by(players_data, experience)

players_sub <- summarize(
    subscribers_by_experience,
    number_subscribed = sum(subscribe)
)

# Number of players in each experience group, one row per group.
players_count <- players_data |>
    count(experience, name = "number_total")

# Group sizes come from players_count instead of being typed in by hand, so
# they stay correct if the data changes.
player_numbers <- players_count$number_total

# Share of each experience group that subscribed. The two tables are matched
# on `experience` rather than by row position.
# NOTE: despite its name, `subscribed_percent` is a proportion between 0 and 1
# (it is not multiplied by 100).
players_sub_percent <- players_sub |>
    inner_join(players_count, by = "experience") |>
    mutate(subscribed_percent = number_subscribed / number_total) |>
    select(-number_total)

# Average age per experience group, skipping players whose age is missing.
players_age <- players_data |>
    group_by(experience) |>
    summarize(mean_age = mean(Age, na.rm = TRUE))

# Scatter plot of hours played against age, coloured by subscription status.
# Points are semi-transparent so overlapping players remain visible.
players_plot_age <- ggplot(
    players_data,
    aes(x = Age, y = played_hours, color = subscribe)
) +
    geom_point(alpha = 0.5)

# Stacked bar chart: number of players per gender, filled by subscription.
players_plot_gender <- players_data |>
    ggplot(aes(x = gender, fill = subscribe)) +
    geom_bar()

# Spread `experience` into one column per level. The head of this pipeline
# was missing, which left a bare pivot_wider() call with no data argument.
# Columns are renamed by name (not by position), so the labels stay correct
# regardless of the order in which the levels first appear in the data.
# Level codes: Beginner = 1, Amateur = 2, Regular = 3, Veteran = 4, Pro = 5.
players_tidy <- players_data |>
    pivot_wider(names_from = experience, values_from = experience) |>
    rename(
        subscribed = subscribe,
        hashed_email = hashedEmail,
        age = Age,
        `5` = Pro,
        `4` = Veteran,
        `2` = Amateur,
        `3` = Regular,
        `1` = Beginner
    )

players_tidy

# Back to long form: one row per player and level code. Rows for levels a
# player does not belong to carry NA in `value`.
players_tidy2 <- players_tidy |>
    pivot_longer(
        cols = all_of(c("5", "4", "2", "3", "1")),
        names_to = "experience_level"
    )
players_tidy2

# Reshape the per-level means to wide form, round them, and return to a long
# table sorted by experience level.
#
# The wide columns used to be relabelled by position with
# c("2", "1", "5", "3", "4"). That is only correct when the columns arrive in
# alphabetical label order (Amateur, Beginner, Pro, Regular, Veteran); when
# the input is already coded 1-5 it silently attaches the wrong level to each
# mean. The relabelling below is keyed on the column name instead.

# Experience label -> level code, the same codes the notebook assigns when it
# builds players_tidy_final.
experience_column_codes <- c(
    Beginner = "1", Amateur = "2", Regular = "3", Veteran = "4", Pro = "5"
)

# Rename columns that are named after an experience label to their level
# code; columns that are already coded are left untouched.
code_experience_columns <- function(wide) {
    is_label <- names(wide) %in% names(experience_column_codes)
    names(wide)[is_label] <-
        unname(experience_column_codes[names(wide)[is_label]])
    wide
}

colnames(players_mean_time) <- c("experience_level", "average_played_hours")

players_mean_time2 <- players_mean_time |>
    pivot_wider(names_from = experience_level, values_from = average_played_hours) |>
    round(2) |>
    code_experience_columns()

players_mean_time3 <- players_mean_time2 |>
    pivot_longer(
        cols = everything(),
        names_to = "experience_level",
        values_to = "average_played_hours"
    ) |>
    arrange(experience_level)

colnames(players_mean_age) <- c("experience_level", "average_age")

players_mean_age2 <- players_mean_age |>
    pivot_wider(names_from = experience_level, values_from = average_age) |>
    round(2) |>
    code_experience_columns()

players_mean_age3 <- players_mean_age2 |>
    pivot_longer(
        cols = everything(),
        names_to = "experience_level",
        values_to = "average_age"
    ) |>
    arrange(experience_level)

In [None]:
# Load the two raw tables from the working directory.
sessions_data <- read_csv(file = "sessions.csv")
players_data <- read_csv(file = "players.csv")

# Wide copy of the players table: one column per experience level, holding
# the level's own label for players at that level and NA otherwise.
players_tidy <- pivot_wider(
    players_data,
    names_from = experience,
    values_from = experience
)

# Build one tidy table per experience level, then stack them.
#
# This replaces five copy-pasted filter/select/rename/pivot sequences.
# Columns are renamed by name rather than by position, and the per-level
# tables are stacked with bind_rows() instead of key-less full_join() calls.
# The per-level tables never share a row (their experience_level differs),
# so the stacked result is the same.

# Rows of `players` whose experience equals `level_label`, with snake_case
# column names and the level stored as a character code in
# `experience_level`.
tidy_experience_level <- function(players, level_label, level_code) {
    players |>
        filter(experience == level_label) |>
        select(-experience) |>
        rename(
            subscribed = subscribe,
            hashed_email = hashedEmail,
            age = Age
        ) |>
        mutate(experience_level = as.character(level_code))
}

players_tidy1 <- tidy_experience_level(players_data, "Beginner", 1)
players_tidy2 <- tidy_experience_level(players_data, "Amateur", 2)
players_tidy3 <- tidy_experience_level(players_data, "Regular", 3)
players_tidy4 <- tidy_experience_level(players_data, "Veteran", 4)
players_tidy5 <- tidy_experience_level(players_data, "Pro", 5)

# Cumulative stacks are kept under their original names in case other cells
# refer to them.
players_tidy12 <- bind_rows(players_tidy1, players_tidy2)
players_tidy123 <- bind_rows(players_tidy12, players_tidy3)
players_tidy1234 <- bind_rows(players_tidy123, players_tidy4)

# Final table: all players, ordered by level, with a numeric level code.
players_tidy_final <- bind_rows(players_tidy1234, players_tidy5) |>
    mutate(experience_level = as.numeric(experience_level))
players_tidy_final

# Mean hours played per experience level, rounded to 2 decimal places.
# Missing play times are ignored.
players_mean_time <- players_tidy_final |>
    group_by(experience_level) |>
    summarize(average_played_hours = mean(played_hours, na.rm = TRUE)) |>
    round(2)

# Mean age per experience level, rounded to 2 decimal places.
# Missing ages are ignored.
players_mean_age <- players_tidy_final |>
    group_by(experience_level) |>
    summarize(average_age = mean(age, na.rm = TRUE)) |>
    round(2)

# One row per experience level with both averages. The join key is stated
# explicitly instead of relying on whichever columns happen to be shared.
players_mean <- inner_join(
    players_mean_time,
    players_mean_age,
    by = "experience_level"
)
players_mean

# Square plotting area for the notebook output.
options(repr.plot.width = 8, repr.plot.height = 8)

# Bar chart of the average player age at each experience level; bar heights
# are taken directly from `average_age`.
players_plot_mean_age <- ggplot(
    players_mean,
    aes(x = experience_level, y = average_age)
) +
    geom_col() +
    labs(
        title = "Visualization of the relationship between age \n and self-reported experience levels",
        x = "Experience level",
        y = "Average age of players"
    ) +
    theme(text = element_text(size = 18))
players_plot_mean_age

# Wider plotting area for the faceted histogram.
options(repr.plot.width = 10, repr.plot.height = 8)

# Age distribution of players, one facet row per experience level, with bars
# stacked by subscription status.
# The y axis counts all players in each age bin (subscribed or not), so its
# label no longer says "players subscribed".
# Fill colours are keyed by value, so a colour cannot shift to the wrong
# group if one of the two values is absent from the data.
players_plot_age <- players_tidy_final |>
    ggplot(aes(x = age, fill = subscribed)) +
    geom_histogram(binwidth = 1.8) +
    facet_grid(rows = vars(experience_level)) +
    labs(x = "Age of players (years)",
         y = "Number of players \n per experience level",
         fill = "Subscribed to newsletter") +
    ggtitle("Distribution between age and subscription \n to newsletters per experience level") +
    theme(text = element_text(size = 18)) +
    scale_fill_manual(values = c("FALSE" = "darkorange", "TRUE" = "steelblue"))
players_plot_age