In [None]:
library(tidyverse)

# Load both CSV files from the working directory.
players <- read_csv(file = "players.csv")
sessions <- read_csv(file = "sessions.csv")

# Print a compact column overview of each data set.
# glimpse() returns its input invisibly, so the *_basic_info objects hold
# the data frames themselves, not a separate summary object.
players_basic_info <- players |> glimpse()
sessions_basic_info <- sessions |> glimpse()

# Per-column summaries produced by base summary().
players_summary_statistics <- players |> summary()
sessions_summary_statistics <- sessions |> summary()

players_summary_statistics
sessions_summary_statistics

# Hand-written data dictionaries: one row per column, pairing the column name
# (taken from the data) with a short description. The descriptions are matched
# to the columns purely by position, so their order must follow the column
# order of the corresponding CSV file.
players_data_description <- tibble(
  variable = names(players),
  meaning = c(
    "Player experience level",
    "Subscriber status",
    "Hashed email (anonymous ID)",
    "Hours the player has played",
    "Player name",
    "Player gender",
    "Player age"
  )
)

players_data_description

sessions_data_description <- tibble(
  variable = names(sessions),
  meaning = c(
    "Hashed email linking to players.csv",
    "Session start time (string)",
    "Session end time (string)",
    "Original system timestamp (start)",
    "Original system timestamp (end)"
  )
)

sessions_data_description

# Count missing values: each result is a one-row table holding the number of
# NA entries in every column of the data set.
players_missing_values <- summarise(
  players,
  across(.cols = everything(), .fns = \(column) sum(is.na(column)))
)

players_missing_values

sessions_missing_values <- summarise(
  sessions,
  across(.cols = everything(), .fns = \(column) sum(is.na(column)))
)

sessions_missing_values

#data description tables
# Build a one-row-per-column overview of a data frame.
#
# Arguments:
#   data       - a data frame / tibble to describe.
#   n_examples - how many leading values to show in the `example` column.
# Returns a tibble with the columns
#   variable, type, missing, example, mean, sd, min, max
# where the four statistics are NA for non-numeric columns.
describe_columns <- function(data, n_examples = 3) {
  # Apply `stat` to every numeric column; non-numeric columns, and numeric
  # columns with no observed values, yield NA instead of NaN/Inf + a warning.
  numeric_stat <- function(stat) {
    vapply(
      data,
      function(column) {
        if (is.numeric(column) && !all(is.na(column))) {
          stat(column, na.rm = TRUE)
        } else {
          NA_real_
        }
      },
      numeric(1),
      USE.NAMES = FALSE
    )
  }

  # A column may carry several classes (e.g. date-times); collapse them so
  # `type` is always a plain character vector with one entry per column.
  column_types <- vapply(
    data,
    function(column) paste(class(column), collapse = "/"),
    character(1),
    USE.NAMES = FALSE
  )
  missing_counts <- vapply(
    data,
    function(column) sum(is.na(column)),
    integer(1),
    USE.NAMES = FALSE
  )
  examples <- vapply(
    data,
    function(column) paste(head(column, n_examples), collapse = ", "),
    character(1),
    USE.NAMES = FALSE
  )

  tibble(
    variable = names(data),
    type = column_types,
    missing = missing_counts,
    example = examples,
    mean = round(numeric_stat(mean), 2),
    sd = round(numeric_stat(sd), 2),
    min = numeric_stat(min),
    max = numeric_stat(max)
  )
}

# Both tables come from the same helper so their layout and the formatting of
# the example values stay identical.
players_data_description_table <- describe_columns(players)
players_data_description_table

sessions_data_description_table <- describe_columns(sessions)
sessions_data_description_table
                                                                                                               

In [None]:
#(1) Data Description:
# Build the bullet-point summary for one data set.
#
# Arguments:
#   data  - a data frame / tibble.
#   title - heading placed on the first line of the summary.
# Returns a character vector: the title, the observation and variable counts,
# and then, for every numeric column, its median, mean, standard deviation,
# minimum and maximum (missing values ignored, 4 significant digits).
#
# All figures and column names are taken from `data` itself rather than typed
# in by hand, so the text cannot drift out of sync with the CSV contents.
describe_data_bullets <- function(data, title) {
  numeric_columns <- names(data)[vapply(data, is.numeric, logical(1))]

  # One consistent number format for every statistic.
  format_stat <- function(value) format(signif(value, digits = 4))

  column_bullets <- lapply(numeric_columns, function(column) {
    values <- data[[column]]
    c(
      paste0("Summary Statistics: ", column),
      paste0("- Median: ", format_stat(median(values, na.rm = TRUE))),
      paste0("- Mean: ", format_stat(mean(values, na.rm = TRUE))),
      paste0("- Standard Deviation: ", format_stat(sd(values, na.rm = TRUE))),
      paste0("- Min: ", format_stat(min(values, na.rm = TRUE))),
      paste0("- Max: ", format_stat(max(values, na.rm = TRUE)))
    )
  })

  c(
    title,
    paste0("- Number of Observations: ", nrow(data)),
    paste0("- Number of Variables: ", ncol(data)),
    unlist(column_bullets)
  )
}

#Players Data Set
players_data_description_bullets <- describe_data_bullets(
  players,
  "Players Data Description"
)
cat(players_data_description_bullets, sep = "\n")

players_data_description_table
players_missing_values


#Sessions Data Set
sessions_data_description_bullets <- describe_data_bullets(
  sessions,
  "Sessions Data Description"
)
cat(sessions_data_description_bullets, sep = "\n")

sessions_data_description_table
sessions_missing_values

In [None]:
'(2) Questions'
'Broad Question: what time windows are most likely to have a large number of simultaneous players?'
'Specific Question: is the classification of weekday or weekend a predictor of a large number of simultaneous players?'

'The data contain player IDs as well as timestamps of player sessions, each of which can be classified as either weekday or weekend.'
'We can then find the average number of players online per minute and classify each minute as either weekday or weekend. The'
'number of players online is a numeric response variable that can be predicted from the weekday/weekend classification, summarised'
'as the average number of players online per minute.'


In [None]:
#(3) Exploratory Data Analysis and Visualization

library(tidyverse)

# Read both CSV files again so this cell can run on its own.
players <- read_csv(file = "players.csv")
sessions <- read_csv(file = "sessions.csv")

# Show the column overview of each data set as evidence the load succeeded.
players |> glimpse()
sessions |> glimpse()

# Mean of every numeric column in players, ignoring missing values and
# rounded to two decimals (one row, one column per variable).
player_means <- summarise(
  select(players, where(is.numeric)),
  across(everything(), \(column) round(mean(column, na.rm = TRUE), 2))
)

# Reshape to long form: one row per variable alongside its mean.
player_means_table <- pivot_longer(
  player_means,
  cols = everything(),
  names_to = "variable",
  values_to = "mean_value"
)

player_means_table

# Scatter plot of the players data: hours played on the x-axis, age on the
# y-axis. Points are semi-transparent so overlapping observations stay visible.
age_vs_played_hours_plot <- ggplot(
  data = players,
  mapping = aes(x = played_hours, y = Age)
) +
  geom_point(alpha = 0.6) +
  labs(
    title = "Relationship Between Played Hours and Age",
    x = "Hours Played",
    y = "Age"
  )
age_vs_played_hours_plot

# Bar chart of the mean hours played (missing values ignored) for each
# experience level; bars are coloured by experience level.
experience_vs_played_hours_plot <- ggplot(
  data = players |>
    group_by(experience) |>
    summarize(mean_hours = mean(played_hours, na.rm = TRUE)),
  mapping = aes(x = experience, y = mean_hours, fill = experience)
) +
  geom_col() +
  labs(
    title = "Mean Hours Played by Experience Level",
    x = "Experience Level",
    y = "Mean Hours Played"
  )
experience_vs_played_hours_plot

# Bar chart of the mean hours played (missing values ignored) for each
# gender category; bars are coloured by gender.
gender_vs_played_hours_plot <- ggplot(
  data = players |>
    group_by(gender) |>
    summarize(mean_hours = mean(played_hours, na.rm = TRUE)),
  mapping = aes(x = gender, y = mean_hours, fill = gender)
) +
  geom_col() +
  labs(
    title = "Mean Hours Played by Gender",
    x = "Gender",
    y = "Mean Hours Played"
  )
gender_vs_played_hours_plot