Step 1: Loading the libraries

In [None]:
#importing the libraries
#  Visualization
library(ggplot2)

# Data manipulation
library(tibble)
library(dplyr)
library(tidyr)
library(purrr)
library(readr)

# Modelling and Preprocessing
library(tidymodels)

# Data splitting and resampling
library(rsample) # train_test_split_equivalent
library(tune) # GridSearchCV equivalent
library(workflows) #make_pipeline equivalent
library(recipes) #make_column_transformer and StandardScaler equivalent

# Metrics
library(yardstick) # for mean squared error

#Time and Dates
library(lubridate)


# Default size for rendered plots (repr.plot.* options)
options(
  repr.plot.width = 12,
  repr.plot.height = 6
)

Step 2: Loading data in R

In [None]:
# Read the two CSV inputs from the working directory; each table is echoed
# right after it is loaded for a quick visual check.
players <- read_csv(file = "players.csv")
players
sessions <- read_csv(file = "sessions.csv")
sessions

In [None]:
# Parse both timestamp columns (day-month-year hour:minute text) into
# date-time values, overwriting the original columns.
sessions <- mutate(
  sessions,
  across(c(start_time, end_time), dmy_hm)
)
sessions

In [None]:
# Stamp both timestamps as GMT (force_tz keeps the clock reading), then
# re-express the same instants in the America/Los_Angeles zone (with_tz).
sessions <- sessions |>
  mutate(
    across(
      c(start_time, end_time),
      \(stamp) stamp |>
        force_tz(tzone = "GMT") |>
        with_tz(tzone = "America/Los_Angeles")
    )
  )
sessions


WRANGLING DATA

In [None]:
# Build an hourly grid covering the whole observation window: from the
# earliest session start (rounded down to the hour) to the latest session
# end (rounded up to the hour). Missing timestamps are ignored.
first_hour <- floor_date(min(sessions$start_time, na.rm = TRUE), unit = "hour")
last_hour <- ceiling_date(max(sessions$end_time, na.rm = TRUE), unit = "hour")

hours <- seq(from = first_hour, to = last_hour, by = "1 hour")
head(hours)

In [None]:
# Count how many sessions are active at each hour mark.
# A session counts as active at hour `h` when start_time <= h <= end_time.
# Sessions with a missing start or end time produce NA in the comparison;
# na.rm = TRUE drops them from the count instead of turning the whole
# hour's total into NA (the hourly grid above ignores NAs the same way).
concurrent_sessions <- map_dfr(hours, function(h) {
  is_active <- sessions$start_time <= h & sessions$end_time >= h
  tibble(
    time = h,
    active_sessions = sum(is_active, na.rm = TRUE)
  )
})
head(concurrent_sessions)

In [None]:
# Attach the full (unabbreviated) weekday name of every hourly timestamp
concurrent_sessions <- mutate(
  concurrent_sessions,
  day_of_week = wday(time, label = TRUE, abbr = FALSE)
)
concurrent_sessions

In [None]:
# Hourly counts that fall on Monday-Friday, tagged with a "weekday" label.
# The label column is named `day_or_end`, matching the column used for the
# weekend subset, so the two tables share the same schema.
weekday_names <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")

weekday_sessions <- concurrent_sessions |>
  filter(day_of_week %in% weekday_names) |>
  mutate(day_or_end = "weekday")
head(weekday_sessions)

In [None]:
# Hourly counts that fall on Saturday or Sunday, tagged with a "weekend" label
weekend_sessions <- mutate(
  filter(concurrent_sessions, day_of_week %in% c("Saturday", "Sunday")),
  day_or_end = "weekend"
)
head(weekend_sessions)

In [None]:
# Per-session features derived from the start/end timestamps:
#   play_duration_minutes - session length in minutes
#   day_of_week           - full weekday name of the session start
#   start_hour            - hour of day (integer part) of the start
#   start_time_of_day     - start as a decimal hour (hour + minute / 60)
# Only sessions with a strictly positive duration are kept.
sessions_durations <- sessions |>
  mutate(
    play_duration_minutes = as.numeric(
      difftime(end_time, start_time, units = "mins")
    ),
    day_of_week = wday(start_time, label = TRUE, abbr = FALSE),
    start_hour = hour(start_time),
    start_time_of_day = start_hour + minute(start_time) / 60
  ) |>
  filter(play_duration_minutes > 0)
head(sessions_durations)

In [None]:
# Bar chart of active sessions by hour of day.
# Every hourly observation that shares the same hour-of-day is stacked into
# one bar, so bar height is the total over the whole observation window.

concurrent_sessions <- concurrent_sessions |>
  mutate(hour = hour(time))

total_sessions_plot <- ggplot(
  concurrent_sessions,
  aes(x = factor(hour), y = active_sessions)
) +
  # geom_col() is the direct form of geom_bar(stat = "identity")
  geom_col(fill = "skyblue") +
  labs(
    title = "Total Active sessions by hour",
    x = "Hour of day",
    y = "Total Active sessions"
  ) +
  theme_minimal()
total_sessions_plot

In [None]:
# Exploratory bar chart: active sessions stacked per day of the week.
# The weekday factor is re-levelled so the x axis runs Monday through Sunday.

week_order <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday",
  "Friday", "Saturday", "Sunday"
)
concurrent_sessions <- mutate(
  concurrent_sessions,
  day_of_week = factor(day_of_week, levels = week_order)
)

dayofweek_exploratory_plot <- ggplot(
  concurrent_sessions,
  aes(x = day_of_week, y = active_sessions)
) +
  geom_col(fill = "green") +
  labs(
    title = "Total sessions by the day of week",
    x = "DAY OF WEEK",
    y = "Total Active sessions"
  ) +
  theme_minimal()
dayofweek_exploratory_plot


In [None]:
# Active sessions by hour of day, one facet row per day of the week.
# hour and the Monday-to-Sunday factor are (re)computed here so this cell
# does not depend on the earlier plotting cells having been run.

concurrent_sessions <- concurrent_sessions |>
  mutate(
    hour = hour(time),
    day_of_week = factor(
      day_of_week,
      levels = c(
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday"
      )
    )
  )

Total_sessions_dayofweek <- ggplot(
  concurrent_sessions,
  aes(x = factor(hour), y = active_sessions)
) +
  # geom_col() is the direct form of geom_bar(stat = "identity")
  geom_col(fill = "pink") +
  facet_wrap(~ day_of_week, ncol = 1) +
  labs(
    title = "Total sessions by hour by day",
    x = "hour of day",
    y = "Total Active sessions"
  ) +
  theme_minimal()
Total_sessions_dayofweek



In [None]:
# Scatter plot of session length against the decimal start hour,
# with one facet row per weekday (ordered Monday through Sunday).
week_order <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday",
  "Friday", "Saturday", "Sunday"
)
sessions_durations <- mutate(
  sessions_durations,
  day_of_week = factor(day_of_week, levels = week_order)
)
head(sessions_durations)

duration_scatter <- ggplot(
  sessions_durations,
  aes(x = start_time_of_day, y = play_duration_minutes)
) +
  geom_point(alpha = 0.4, color = "red") +
  facet_wrap(~ day_of_week, ncol = 1) +
  labs(
    title = "session duration in minutes by the time of day",
    x = "hour of day",
    y = "duration of session (minutes)"
  ) +
  theme_minimal()
duration_scatter

In [None]:
# Mean session length (minutes) for each day of the week, shown as bars
avg_duration <- summarize(
  group_by(sessions_durations, day_of_week),
  avg_play_duration = mean(play_duration_minutes, na.rm = TRUE)
)

duration_bar <- ggplot(
  avg_duration,
  aes(x = day_of_week, y = avg_play_duration)
) +
  geom_col(fill = "purple") +
  labs(
    title = "session duration in minutes by day of the week",
    x = "day of the week",
    y = "duration of session (minutes)"
  ) +
  theme_minimal()
duration_bar

ANALYSIS OF HOURS REGARDLESS OF DAY OF WEEK

In [None]:
# Random 75% / 25% train-test split of the hourly counts (seeded so the
# split is repeatable), followed by a preview of the predictor (hour) and
# response (active_sessions) for each partition.
set.seed(2000)

split_obj <- initial_split(concurrent_sessions, prop = 0.75)

sessions_train <- training(split_obj)
sessions_test <- testing(split_obj)

# Training predictor and response
x_train <- select(sessions_train, hour)
head(x_train)

y_train <- pull(sessions_train, active_sessions)
head(y_train)

# Test predictor and response
x_test <- select(sessions_test, hour)
head(x_test)

y_test <- pull(sessions_test, active_sessions)
head(y_test)

In [None]:
# KNN regression of active_sessions on hour, for all days together.
# Pipeline: preprocessing recipe -> tunable KNN model -> 5-fold CV over
# k = 1..49 -> refit with the best k on the training set -> test-set RMSE.

# Preprocessing: scale, then center, the single predictor
knn_recipe <- recipe(active_sessions ~ hour, data = sessions_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# Model specification: the number of neighbors is left to be tuned
knn_spec <- nearest_neighbor(mode = "regression", neighbors = tune()) |>
  set_engine("kknn")

# Workflow bundling the recipe and the model
knn_workflow <- workflow() |>
  add_recipe(knn_recipe) |>
  add_model(knn_spec)

# Tuning grid: neighbors 1 to 49
knn_grid <- tibble(neighbors = 1:49)

# Resampling setup: 5-fold CV (seeded so the folds are repeatable)
set.seed(2000)
folds <- vfold_cv(sessions_train, v = 5)

# Evaluate every candidate k by cross-validated RMSE
knn_tuned <- tune_grid(
  knn_workflow,
  resamples = folds,
  grid = knn_grid,
  metrics = metric_set(rmse)
)

# Pick the k with the lowest cross-validated RMSE.
# `metric` is passed by name: recent tune releases reject a positional
# metric argument, while the named form works on older releases too.
best_knn <- knn_tuned |>
  select_best(metric = "rmse")

# Finalize workflow with best number of neighbors
final_knn_workflow <- knn_workflow |>
  finalize_workflow(best_knn)

# Fit on full training data
final_knn_fit <- final_knn_workflow |>
  fit(data = sessions_train)

# Predict on test data, keeping the original columns next to .pred
predictions <- predict(final_knn_fit, sessions_test) |>
  bind_cols(sessions_test)

# Calculate RMSE on the held-out test set
rmse(predictions, truth = active_sessions, estimate = .pred)

In [None]:
# Repeat the grid search over the same folds and candidate grid, then
# extract and display the training response vector.
knn_tuned <- tune_grid(
  knn_workflow,
  resamples = folds,
  grid = knn_grid,
  metrics = metric_set(rmse)
)
y_train <- sessions_train |>
  pull(active_sessions)
y_train

In [None]:
# Test-set predictions from the finalized KNN fit, kept beside the test rows
sessions_test_preds <- bind_cols(
  predict(final_knn_fit, new_data = sessions_test),
  sessions_test
)

# Test-set RMSE; only the numeric estimate is displayed
rmse_result <- rmse(
  data = sessions_test_preds,
  truth = active_sessions,
  estimate = .pred
)
rmse_result$.estimate

In [None]:
# Predictions on the training rows, kept beside the training data
sessions_train_preds <- bind_cols(
  predict(final_knn_fit, new_data = sessions_train),
  sessions_train
)

ANALYSIS SEPARATING WEEKENDS AND WEEKDAYS

In [None]:
# KNN regression of active_sessions on hour, weekday observations only.
# Same pipeline as the all-days model: 75/25 split, scale + center recipe,
# 5-fold CV over k = 1..49, refit with the best k, test-set RMSE.
set.seed(2000)  # Reproducibility of the split and the CV folds

# Hour of day is the only predictor
weekday_sessions <- weekday_sessions |>
  mutate(hour = lubridate::hour(time))

# 75/25 split
weekday_split <- initial_split(weekday_sessions, prop = 0.75)
weekday_train <- training(weekday_split)
weekday_test <- testing(weekday_split)
weekday_train
weekday_test

weekday_recipe <- recipe(active_sessions ~ hour, data = weekday_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

weekday_knn_spec <- nearest_neighbor(
  mode = "regression",
  neighbors = tune()
) |>
  set_engine("kknn")

# Use the spec defined in this cell rather than the `knn_spec` object from
# the all-days analysis, so the cell does not depend on that earlier cell.
weekday_workflow <- workflow() |>
  add_recipe(weekday_recipe) |>
  add_model(weekday_knn_spec)

knn_grid <- tibble(neighbors = 1:49)

weekday_resamples <- vfold_cv(weekday_train, v = 5)

weekday_tuned <- tune_grid(
  weekday_workflow,
  resamples = weekday_resamples,
  grid = knn_grid,
  metrics = metric_set(rmse)
)

# `metric` is passed by name: recent tune releases reject a positional
# metric argument, while the named form works on older releases too.
best_k <- select_best(weekday_tuned, metric = "rmse")

weekday_final_wf <- finalize_workflow(weekday_workflow, best_k)

weekday_fit <- fit(weekday_final_wf, data = weekday_train)

weekday_test_preds <- predict(weekday_fit, new_data = weekday_test) |>
  bind_cols(weekday_test)

# Calculate RMSE on the held-out weekday test set
rmse(weekday_test_preds, truth = active_sessions, estimate = .pred)

# Predictions on the training rows, kept beside the training data
weekday_train_preds <- predict(weekday_fit, new_data = weekday_train) |>
  bind_cols(weekday_train)

In [None]:
# KNN regression of active_sessions on hour, weekend observations only.
# Same pipeline as the weekday model: 75/25 split, scale + center recipe,
# 5-fold CV over k = 1..49, refit with the best k, test-set RMSE.
set.seed(2000)  # Reproducibility of the split and the CV folds

# Hour of day is the only predictor
weekend_sessions <- weekend_sessions |>
  mutate(hour = lubridate::hour(time))

# 75/25 split
weekend_split <- initial_split(weekend_sessions, prop = 0.75)
weekend_train <- training(weekend_split)
weekend_test <- testing(weekend_split)
weekend_train
weekend_test

weekend_recipe <- recipe(active_sessions ~ hour, data = weekend_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

weekend_knn_spec <- nearest_neighbor(
  mode = "regression",
  neighbors = tune()
) |>
  set_engine("kknn")

# Use the spec defined in this cell rather than the `knn_spec` object from
# the all-days analysis, so the cell does not depend on that earlier cell.
weekend_workflow <- workflow() |>
  add_recipe(weekend_recipe) |>
  add_model(weekend_knn_spec)

knn_grid <- tibble(neighbors = 1:49)

weekend_resamples <- vfold_cv(weekend_train, v = 5)

weekend_tuned <- tune_grid(
  weekend_workflow,
  resamples = weekend_resamples,
  grid = knn_grid,
  metrics = metric_set(rmse)
)

# `metric` is passed by name: recent tune releases reject a positional
# metric argument, while the named form works on older releases too.
best_k <- select_best(weekend_tuned, metric = "rmse")

weekend_final_wf <- finalize_workflow(weekend_workflow, best_k)

weekend_fit <- fit(weekend_final_wf, data = weekend_train)

weekend_test_preds <- predict(weekend_fit, new_data = weekend_test) |>
  bind_cols(weekend_test)

# Calculate RMSE on the held-out weekend test set
rmse(weekend_test_preds, truth = active_sessions, estimate = .pred)

# Predictions on the training rows, kept beside the training data
weekend_train_preds <- predict(weekend_fit, new_data = weekend_train) |>
  bind_cols(weekend_train)

In [None]:
# Plot: maximum concurrent sessions by the day of week

