### Install libraries

In [None]:
#install.packages("extRemes") # for generating data
#install.packages("ggplot2") # for plotting results
#install.packages("dplyr") # for printing summary tables
#install.packages("tidyr") 
#install.packages("knitr")
#install.packages("IRdisplay") # nicer print in jupyter-lab
#install.packages("tibble")

### Load libraries

In [None]:
library(aftsem)
# you should load aftsem before survival, otherwise one might expect conflicts in packages
library(survival)
library(extRemes) # for data generation
library(ggplot2)
library(dplyr)
library(tidyr)
library(knitr)
library(IRdisplay)
library(tibble)

### About experiments

The results of the experiments are stored as RDS files. Be careful before starting an experiment: you may need to change the RDS file name, otherwise you will overwrite the result of an older simulation. Some results are stored in the exp folder; you can use them for inspection or for plotting.

### Dataset generator
This function provides a simple dataset generator that takes three parameters:
<ul>
    <li>n = number of samples/observations</li>
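    <li>error = distribution of the error term eps ("normal", "extreme" or "logistic")</li>
    <li>censor = upper bound of the Uniform(0, censor) censoring times, or -1 for no censoring; the simulation functions translate a target censoring percentage into this value (more about this later in the simulation functions)</li>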
</ul>

The final model looks like this: <b>logT = 2 + x1 + x2 + x3 + eps</b>

In [None]:
# Generate one simulated data set from the AFT model
#   log(T) = 2 + x1 + x2 + x3 + eps
#
# Arguments:
#   n      - number of observations
#   error  - distribution of eps: "normal" (rnorm), "extreme" (revd with
#            loc = 0, scale = 1, shape = 0) or "logistic" (rlogis with
#            location = 0, scale = 1)
#   censor - upper bound of the Uniform(0, censor) censoring times; -1 sets
#            every censoring time to 10^4 instead
#
# Returns a data frame with the observed time (Time), the event indicator
# (status, 1 = event observed), the covariates x1, x2, x3, an id column and
# the error / censor settings repeated on every row.
#
# The random draws are made in a fixed order (x1, x2, x3, eps, censoring
# times), so the output is reproducible for a given seed.
datgen_model1 <- function(n = 50, error = "normal", censor = 40)
{
  stopifnot(is.character(error), length(error) == 1L)

  x1 <- rbinom(n, 1, 0.5) # binary covariate, p = 0.5
  x2 <- rnorm(n) # standard normal covariate
  x3 <- rnorm(n) # standard normal covariate

  # An unrecognised error type used to leave eps at zero without any warning;
  # fail loudly instead so a typo cannot produce noise-free data.
  e <- switch(error,
    normal = rnorm(n),
    extreme = revd(n, loc = 0, scale = 1, shape = 0),
    logistic = rlogis(n, location = 0, scale = 1),
    stop("unknown error distribution: ", error, call. = FALSE)
  )

  time <- exp(2 + x1 + x2 + x3 + e) # true (uncensored) event times

  if (censor == -1) {
    # "no censoring": a very large constant censoring time
    # NOTE(review): event times above 10^4 are still censored by this bound
    cen <- rep(10^4, n)
  } else {
    cen <- runif(n, 0, censor)
  }

  # observed time, censoring status, covariates, id and the settings used
  data.frame(Time = pmin(time, cen), status = 1 * (time <= cen), x1 = x1, x2 = x2, x3 = x3, id = 1:n, error = error, censor = censor)
}

In [None]:
# test_data <- datgen_model1(n=400,error="normal",censor=30)

In [None]:
# Draw a large sample from each of the three error distributions and overlay
# their kernel density estimates in one figure.
n <- 100000

# keep the draw order (extreme, logistic, normal)
data_revd <- revd(n, loc = 0, scale = 1, shape = 0)
data_logis <- rlogis(n, location = 0, scale = 1)
data_normal <- rnorm(n)

# one long data frame, labelled by distribution (labels are in Czech)
df_revd <- data.frame(value = data_revd, Rozdělení = "Extrémní")
df_logis <- data.frame(value = data_logis, Rozdělení = "Logistické")
df_normal <- data.frame(value = data_normal, Rozdělení = "Normální")
df_combined <- do.call(rbind, list(df_revd, df_logis, df_normal))

# build the figure layer by layer
pp <- ggplot(data = df_combined, mapping = aes(x = value, color = Rozdělení))
pp <- pp + geom_density(alpha = 0.3, size = 0.8)
pp <- pp + labs(title = "Hustoty rozdělení", x = "Hodnota", y = "Hustota")
pp <- pp + theme_minimal()
pp <- pp + scale_color_manual(
  values = c("Extrémní" = "blue", "Logistické" = "red", "Normální" = "green")
)

print(pp)
ggsave("density_plot.png", pp, width = 10, height = 6, dpi = 300)

## Create seed file
Necessary for reproducibility. The file is already generated; the user just needs to load the text file.

In [None]:
#set.seed(54)
#write(sample(1:10000000,size=1000),file="seeds.txt",ncolumns=1)

#### Load seed file

In [None]:
# Seeds for the simulation replicates, read from the first column of seeds.txt.
seeds <- read.table("seeds.txt", header = FALSE)[["V1"]]

In [None]:
# True regression coefficients; bias and MSE are measured against them.
true_coefficients <- rep(1, 3)

## Simulation Config
Simulation config for the first experiment. We are interested in the precision and efficiency of the implemented methods.

In [None]:
# Scenario grid of the first experiment (all available options).
censoring <- c(0, 25, 50, 90) # target censoring percentages
methods <- c("gehan", "gehan-poly", "gehan-heller", "jin") # estimation methods
error_types <- c("normal", "extreme", "logistic") # error distributions

### Bias, MSE and avg Time simulation study on all methods
The function takes two parameters
<ul>
    <li>n_simulations = number of simulations</li>
    <li>sample_size = number of observations in dataset</li>
</ul>

In [None]:
# Monte Carlo study of bias, MSE and average fitting time for every
# combination of error distribution, censoring level and estimation method.
#
# Arguments:
#   n_simulations - number of simulated data sets per scenario
#   sample_size   - number of observations in each data set
#
# Relies on the globals error_types, censoring, methods, true_coefficients and
# seeds, and on datgen_model1(). The result lists (keyed by
# "<error>_<method>_<censor>") are not returned; they are written to four RDS
# files in the working directory.
run_sim <- function(n_simulations, sample_size)
{
    # one seed per replicate is needed; fail before any model is fitted
    stopifnot(n_simulations <= length(seeds))

    time_results <- list() # average elapsed fitting time per scenario
    results <- list() # bias and MSE per scenario
    estimates_sim <- list() # matrix of estimated regression parameters per scenario
    fe_results <- list() # average of fit$fe (function calls) per scenario

    # Iterate through all scenarios
    for (error in error_types)
    {
        for (censor in censoring)
        {
            # Upper bound of the uniform censoring distribution handed to
            # datgen_model1(). The numbers were adjusted manually to give the
            # desired percentage of censoring; -1 switches censoring off.
            censorNEW <- switch(as.character(censor),
                "0" = -1,
                "25" = 100,
                "50" = 30,
                "90" = 3,
                stop("no censoring bound defined for censor = ", censor, call. = FALSE)
            )

            for (method in methods)
            {
                scenario <- paste(error, method, censor, sep = "_")
                estimates <- matrix(NA, nrow = n_simulations, ncol = length(true_coefficients))
                times <- numeric(n_simulations)
                fe_calls <- numeric(n_simulations)

                for (i in seq_len(n_simulations))
                {
                    # replicate i uses the same seed in every scenario
                    set.seed(seeds[i])
                    data <- datgen_model1(n = sample_size, error = error, censor = censorNEW)

                    # measure the time of model fitting
                    time_taken <- system.time({
                        fit <- aftsem(Surv(log(data$Time), data$status) ~ data$x1 + data$x2 + data$x3, method = method)
                        estimates[i, ] <- c(fit$beta)
                        # fe is only recorded for gehan-heller and gehan-poly
                        if (method %in% c("gehan-heller", "gehan-poly")) fe_calls[i] <- fit$fe
                    })

                    times[i] <- time_taken[3] # elapsed time
                }

                bias <- colMeans(estimates) - true_coefficients
                # sweep() subtracts coefficient j from column j; a plain
                # `estimates - true_coefficients` would recycle the vector down
                # the rows and mix up the coefficients.
                mse <- colMeans(sweep(estimates, 2, true_coefficients)^2)

                results[[scenario]] <- list(bias = bias, mse = mse)
                time_results[[scenario]] <- mean(times)
                fe_results[[scenario]] <- mean(fe_calls)
                estimates_sim[[scenario]] <- estimates
            }
        }
    }

    # save the simulations into RDS files, change the names if necessary
    saveRDS(time_results, paste("time_results", sample_size,"2", ".rds", sep = "_"))
    saveRDS(results, paste("results",sample_size,"2",".rds", sep = "_"))
    saveRDS(estimates_sim, paste("estimates_sim",sample_size,"2",".rds", sep = "_"))
    saveRDS(fe_results, paste("fe_results",sample_size,"2",".rds",sep = "_"))
}

In [None]:
# First experiment: 1000 replicates with 50 observations per data set.
run_sim(sample_size = 50, n_simulations = 1000)

### See the results
This creates a summary data frame that contains the bias, MSE and average time. The data must be stored under the names results and time_results.

In [None]:
# results <- readRDS("results_400_.rds")
# time_results <- readRDS("time_results_400_.rds")

In [None]:
# One row per scenario and coefficient: bias and MSE from `results`, average
# fitting time from `time_results` (repeated for each coefficient).
summary_df <- do.call(
  rbind,
  lapply(names(results), function(scenario) {
    entry <- results[[scenario]]
    data.frame(
      Scenario = scenario,
      Bias = entry$bias,
      MSE = entry$mse,
      Avg_Time = time_results[[scenario]]
    )
  })
)

In [None]:
# raise the repr row/column limits so that the whole data frame is shown
options(repr.matrix.max.cols = 10, repr.matrix.max.rows = 150)
IRdisplay::display(summary_df)

### Plot our results
This function plots the estimated regression coefficients. For the chosen coefficient it compares the methods pairwise: the Gehan estimates are plotted on the y-axis and the Heller, Poly and Jin estimates on the x-axis. The function takes 4 parameters:
<ul>
    <li>estimates = list of estimated regression parameters (for example the content of estimates_sim_100_.rds)</li>
    <li>column_number = which of the betas we want to plot (1, 2 or 3)</li>
    <li>percent_censoring = which censoring scenario we want to plot</li>
    <li>distribution_name = which epsilon error scenario we want to plot</li>
</ul>

In [None]:
# Scatter plots comparing the Gehan estimates of one regression coefficient
# with the Gehan-Heller, Gehan-Poly and Jin estimates of the same coefficient.
#
# Arguments:
#   estimates         - list of estimate matrices keyed by
#                       "<distribution>_<method>_<censoring>"
#   column_number     - column (coefficient) of the matrices to plot
#   percent_censoring - censoring scenario to plot
#   distribution_name - error-distribution scenario to plot
#
# Each plot has Gehan on the y-axis, the other method on the x-axis and a
# dashed red identity line. The three plots are saved as PNG files in the
# working directory and printed.
plot_estimates <- function(estimates, column_number, percent_censoring, distribution_name)
{
  # key under which the estimates of one method are stored
  scenario_key <- function(method) {
    paste(distribution_name, method, percent_censoring, sep = "_")
  }

  # fail with a readable message instead of an indexing error on NULL
  needed <- vapply(c("gehan", "gehan-heller", "gehan-poly", "jin"), scenario_key, character(1))
  absent <- setdiff(needed, names(estimates))
  if (length(absent) > 0) {
    stop("estimates has no entry for: ", paste(absent, collapse = ", "), call. = FALSE)
  }

  gehan_data <- estimates[[scenario_key("gehan")]][, column_number]

  # Gehan (y) against one other method (x)
  scatter_vs_gehan <- function(method, label) {
    plot_data <- data.frame(
      Gehan = gehan_data,
      Other = estimates[[scenario_key(method)]][, column_number]
    )
    ggplot(plot_data, aes(x = Other, y = Gehan)) +
      geom_point(alpha = 0.5) +
      geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
      xlab(label) +
      ylab("Gehan") +
      ggtitle(paste(paste("Gehan vs.", label),
                    "(Cenzorovani:", percent_censoring, "%,", distribution_name, ")"))
  }

  p1 <- scatter_vs_gehan("gehan-heller", "Gehan-Heller")
  p2 <- scatter_vs_gehan("gehan-poly", "Gehan-Poly")
  # the third plot previously reused the Gehan-Poly data under a "Jin" label
  p3 <- scatter_vs_gehan("jin", "Jin")

  # save and print
  ggsave(filename = paste0("plot_Gehan_vs_Gehan-Heller_4", distribution_name, "_", percent_censoring, ".png"), plot = p1)
  ggsave(filename = paste0("plot_Gehan_vs_Gehan-Poly_4", distribution_name, "_", percent_censoring, ".png"), plot = p2)
  ggsave(filename = paste0("plot_Gehan_vs_Gehan vs. Jin_4", distribution_name, "_", percent_censoring, ".png"), plot = p3)
  print(p1)
  print(p2)
  print(p3)
}


# Second coefficient, 50 % censoring, normal errors. `estimates` is loaded
# from an RDS file in a later cell and must exist before this call is run.
plot_estimates(
  estimates = estimates,
  column_number = 2,
  percent_censoring = 50,
  distribution_name = "normal"
)


### Plot time results
Plot the time results from the previous experiment. The function expects one parameter:
<ul>
    <li>data_list = time_results list from the first experiment</li>
</ul>

In [None]:
# Scatter plot of average fitting times against the censoring percentage,
# coloured by error distribution and shaped by method.
#
# Arguments:
#   data_list - named list of average times; the names are parsed as
#               "<distribution>_<method>_<censoring>"
#   filename  - file the plot is saved to (the default keeps the file name
#               that used to be hard-coded)
#
# Prints the parsed data frame as a sanity check and saves the plot with
# ggsave(); the plot itself is not printed.
plot_time <- function(data_list, filename = "Time_result_100.png")
{
    # convert the list to a data frame and split the names into their parts
    data <- enframe(data_list, name = "metric", value = "value") %>%
        mutate(
            metric = as.character(metric),
            # leading run of lower-case letters
            distribution = gsub("([a-z]+)_.*", "\\1", metric),
            # what is left between the distribution and the trailing number
            method = gsub("[a-z]+_(.*?)(_\\d+)?$", "\\1", metric),
            # trailing number
            percent_censoring = as.numeric(gsub(".*_(\\d+)$", "\\1", metric)),
            value = as.numeric(unlist(value))
        ) %>%
        select(distribution, method, percent_censoring, value)

    # sanity check
    print(data)

    p1 <- ggplot(data, aes(x = percent_censoring, y = value, color = distribution, shape = method)) +
        geom_point(size = 3, alpha = 0.6) +
        labs(
            title = "Scatter Plot časových výsledků",
            x = "Míra Cenzorování[%]",
            y = "Čas[s]",
            color = "Rozdělení dat",
            shape = "Metoda"
        ) +
        theme_minimal() +
        scale_color_brewer(palette = "Set1") +
        theme(
            legend.position = "right",
            legend.title = element_text(size = 12),
            legend.text = element_text(size = 10)
        )
    ggsave(filename = filename, plot = p1)
}

In [None]:
# Load the stored results of the experiment with 100 observations
estimates <- readRDS(file = "estimates_sim_100_.rds")
results <- readRDS(file = "results_100_.rds")
time_results <- readRDS(file = "time_results_100_.rds")

In [None]:
# Plot the average fitting times loaded above
plot_time(data_list = time_results)

### New simulation config
Let's now focus on the methods "gehan-heller" and "gehan-poly". We are interested in the effect of different optimization algorithms.

In [None]:
# Scenario grid of the optimisation-algorithm experiment
true_coefficients <- c(1, 1, 1)
# only the normal distribution is used; the other distributions gave similar results
error_types <- c("normal")
methods <- c("gehan-poly", "gehan-heller")
# 0 % censoring is left out to focus on censored data
censoring <- c(25, 50, 90)

### Different optimalization algorithms simulation
The function expect three parameters
<ul>
    <li>n_simulations=number of simulations</li>
    <li>sample_size=number of observations in dataset</li>
    <li>alg=optimalization algorithm</li>
</ul>

The optimization algorithm can be any of the algorithms available in the package optimx; see the documentation for more information: https://cran.r-project.org/web/packages/optimx/index.html

In [None]:
# Simulation comparing optimisation algorithms for the methods listed in the
# global `methods`.
#
# Arguments:
#   n_simulations - number of simulated data sets per scenario
#   sample_size   - number of observations in each data set
#   alg           - algorithm name passed to aftsem() as control$optimx.alg;
#                   it is also part of the output file names
#
# Relies on the globals error_types, censoring, methods, true_coefficients and
# seeds, and on datgen_model1(). Bias/MSE, average times, estimates and
# average fit$fe are written to RDS files in the working directory. The
# convergence counts are collected but not saved.
run_simulation_for_gehans <- function(n_simulations, sample_size, alg)
{
    # one seed per replicate is needed; fail before any model is fitted
    stopifnot(n_simulations <= length(seeds))

    time_results <- list()
    results <- list()
    estimates_sim <- list()
    fe_results <- list() # track number of function calls
    convergence <- list() # track convergence status
    ##
    ## fe_results and convergence were not used in the final evaluation
    ##

    for (error in error_types)
    {
        for (censor in censoring)
        {
            # Upper bound of the uniform censoring distribution handed to
            # datgen_model1(); -1 switches censoring off.
            censorNEW <- switch(as.character(censor),
                "0" = -1,
                "25" = 100,
                "50" = 30,
                "90" = 3,
                stop("no censoring bound defined for censor = ", censor, call. = FALSE)
            )

            for (method in methods)
            {
                scenario <- paste(error, method, censor, sep = "_")
                estimates <- matrix(NA, nrow = n_simulations, ncol = length(true_coefficients))
                times <- numeric(n_simulations) # elapsed time per replicate
                fe_calls <- numeric(n_simulations) # fit$fe per replicate
                convergence_calls <- numeric(n_simulations) # fit$converged per replicate

                for (i in seq_len(n_simulations))
                {
                    # replicate i uses the same seed in every scenario
                    set.seed(seeds[i])
                    data <- datgen_model1(n = sample_size, error = error, censor = censorNEW)

                    # measure the time of model fitting
                    time_taken <- system.time({
                        fit <- aftsem(Surv(log(data$Time), data$status) ~ data$x1 + data$x2 + data$x3, method = method, control = list(use.grad = FALSE, optimx.alg = alg, variance.estimation = FALSE, gehan_eps = 10^-6))
                        estimates[i, ] <- c(fit$beta)
                        fe_calls[i] <- fit$fe
                        convergence_calls[i] <- fit$converged
                    })

                    times[i] <- time_taken[3] # elapsed time
                }

                bias <- colMeans(estimates) - true_coefficients
                # sweep() subtracts coefficient j from column j; a plain
                # `estimates - true_coefficients` would recycle the vector down
                # the rows and mix up the coefficients.
                mse <- colMeans(sweep(estimates, 2, true_coefficients)^2)
                succesive_convergence <- sum(convergence_calls[convergence_calls == TRUE])

                results[[scenario]] <- list(bias = bias, mse = mse)
                time_results[[scenario]] <- mean(times)
                fe_results[[scenario]] <- mean(fe_calls)
                estimates_sim[[scenario]] <- estimates
                convergence[[scenario]] <- succesive_convergence
            }
        }
    }

    saveRDS(time_results, paste("time_results",alg,sample_size, ".rds", sep = "_"))
    saveRDS(results, paste("results",alg,sample_size,".rds", sep = "_"))
    saveRDS(estimates_sim, paste("estimates_sim",alg,sample_size,".rds", sep = "_"))
    saveRDS(fe_results, paste("fe_results",alg,sample_size,".rds",sep = "_"))
}

In [None]:
# 100 replicates of 800 observations with the "L-BFGS" algorithm
run_simulation_for_gehans(n_simulations = 100, sample_size = 800, alg = "L-BFGS")

In [None]:
# sanity check
#tr <- readRDS("time_results_Nelder-Mead_800_.rds")
#r <- readRDS("results_Nelder-Mead_800_.rds")
#es <- readRDS("estimates_sim_BFGS_400_.rds")
#fe_r <- readRDS("fe_results_L-BFGS_400_.rds")

### See the results
This summary dataframe has the same structure as the previous one

In [None]:
# One row per scenario and coefficient, built from the result list `r` and the
# time list `tr` (the commented-out sanity-check lines above load them).
summary_df <- do.call(
  rbind,
  lapply(names(r), function(scenario) {
    entry <- r[[scenario]]
    data.frame(
      Scenario = scenario,
      Bias = entry$bias,
      MSE = entry$mse,
      Avg_Time = tr[[scenario]]
    )
  })
)

In [None]:
# Show the summary table (auto-printed as the value of the cell).
summary_df

### New simulation config
We now focus on the impact of the initial beta estimate. A user of the package <b>aftsem</b> can choose among three initial beta estimates: gehan, lm (ordinary least squares solution) or a custom numeric vector (in this simulation, a vector of zeros).

In [None]:
# Scenario grid of the initial-beta experiment
bb <- c("zero", "lm", "gehan") # initial beta choices
censoring <- c(25, 50, 90) # again without the 0 % censoring scenario
error_types <- c("normal", "extreme", "logistic")
methods <- c("gehan-poly", "gehan-heller")

### Different initial beta guess simulation
The function takes two parameters
<ul>
    <li>n_simulations=number of simulations</li>
    <li>sample_size=number of observations of data</li>
</ul>
The simulation again tracks the fe and convergence results; however, they are also not used in the thesis evaluation, because I did not find them interesting.

In [None]:
# Simulation studying the effect of the initial beta estimate.
#
# Arguments:
#   n_simulations - number of simulated data sets per scenario
#   sample_size   - number of observations in each data set
#
# Relies on the globals error_types, censoring, methods, bb,
# true_coefficients and seeds, and on datgen_model1(). Result lists are keyed
# by "<error>_<method>_<censor>_<binit>". Bias/MSE, average times, estimates
# and average fit$fe are written to RDS files in the working directory. The
# convergence counts are collected but not saved.
run_simulations_for_binit <- function(n_simulations, sample_size)
{
    # one seed per replicate is needed; fail before any model is fitted
    stopifnot(n_simulations <= length(seeds))

    time_results <- list()
    results <- list()
    estimates_sim <- list()
    fe_results <- list()
    convergence <- list()

    for (error in error_types)
    {
        for (censor in censoring)
        {
            # Upper bound of the uniform censoring distribution handed to
            # datgen_model1(); -1 switches censoring off.
            censorNEW <- switch(as.character(censor),
                "0" = -1,
                "25" = 100,
                "50" = 30,
                "90" = 3,
                stop("no censoring bound defined for censor = ", censor, call. = FALSE)
            )

            for (method in methods)
            {
                for (b in bb)
                {
                    scenario <- paste(error, method, censor, b, sep = "_")

                    # "zero" stands for a numeric start vector of zeros; any
                    # other value is passed to aftsem() unchanged
                    binit_value <- if (b == "zero") rep(0, length(true_coefficients)) else b

                    estimates <- matrix(NA, nrow = n_simulations, ncol = length(true_coefficients))
                    times <- numeric(n_simulations) # elapsed time per replicate
                    fe_calls <- numeric(n_simulations) # fit$fe per replicate
                    convergence_calls <- numeric(n_simulations) # fit$converged per replicate

                    for (i in seq_len(n_simulations))
                    {
                        # replicate i uses the same seed in every scenario
                        set.seed(seeds[i])
                        data <- datgen_model1(n = sample_size, error = error, censor = censorNEW)

                        # measure the time of model fitting
                        time_taken <- system.time({
                            fit <- aftsem(Surv(log(data$Time), data$status) ~ data$x1 + data$x2 + data$x3, method = method, binit = binit_value, control = list(use.grad = FALSE, optimx.alg = "BFGS", variance.estimation = FALSE, gehan_eps = 10^-6, quantile.method = "br"))
                            estimates[i, ] <- c(fit$beta)
                            fe_calls[i] <- fit$fe
                            convergence_calls[i] <- fit$converged
                        })

                        times[i] <- time_taken[3] # elapsed time
                    }

                    bias <- colMeans(estimates) - true_coefficients
                    # sweep() subtracts coefficient j from column j; a plain
                    # `estimates - true_coefficients` would recycle the vector
                    # down the rows and mix up the coefficients.
                    mse <- colMeans(sweep(estimates, 2, true_coefficients)^2)
                    succesive_convergence <- sum(convergence_calls[convergence_calls == TRUE])

                    results[[scenario]] <- list(bias = bias, mse = mse)
                    time_results[[scenario]] <- mean(times)
                    fe_results[[scenario]] <- mean(fe_calls)
                    estimates_sim[[scenario]] <- estimates
                    convergence[[scenario]] <- succesive_convergence
                }
            }
        }
    }

    saveRDS(time_results, paste("time_results3","binit", ".rds", sep = "_"))
    saveRDS(results, paste("results3","binit",".rds", sep = "_"))
    saveRDS(estimates_sim, paste("estimates_sim3","binit",".rds", sep = "_"))
    saveRDS(fe_results, paste("fe_results3","binit",".rds",sep = "_"))
}

In [None]:
# 500 replicates with 100 observations per data set
run_simulations_for_binit(sample_size = 100, n_simulations = 500)

In [None]:
# Sanity check, check the simulated data
#rrr <- readRDS("results2_binit_.rds")
#ttt <- readRDS("time_results2_binit_.rds")

### Create dataframe of results

In [None]:
# One row per scenario and coefficient, built from the result list `rrr` and
# the time list `ttt` (the commented-out sanity-check lines above load them).
summary_df <- do.call(
  rbind,
  lapply(names(rrr), function(scenario) {
    entry <- rrr[[scenario]]
    data.frame(
      Scenario = scenario,
      Bias = entry$bias,
      MSE = entry$mse,
      Avg_Time = ttt[[scenario]]
    )
  })
)

In [None]:
# raise the repr row/column limits so that the whole data frame is shown
options(repr.matrix.max.cols = 10, repr.matrix.max.rows = 150)
IRdisplay::display(summary_df)

### New simulation config
We are interested in the effect of choosing different quantile regression algorithms from the package quantreg.

In [None]:
# Scenario grid of the quantile-regression-algorithm experiment
algs <- c("fn", "pfn") # values tried for control$quantile.method
methods <- c("gehan")
error_types <- c("normal", "extreme", "logistic")
censoring <- c(25, 50, 90)

### Different quantile regression algorithms
The function has the same structure as the previous one and takes two parameters:
<ul>
    <li>n_simulations=number of simulations</li>
    <li>sample_size=number of observations of each dataset</li>
</ul>

In [None]:
# Simulation comparing the quantile-regression algorithms in the global `algs`
# for the "gehan" method.
#
# Arguments:
#   n_simulations - number of simulated data sets per scenario
#   sample_size   - number of observations in each data set
#
# Relies on the globals error_types, censoring, algs, true_coefficients and
# seeds, and on datgen_model1(). Result lists are keyed by
# "<error>_<algorithm>_<censor>" and written to three RDS files in the working
# directory.
run_simulations_for_median_reg <- function(n_simulations, sample_size)
{
    # one seed per replicate is needed; fail before any model is fitted
    stopifnot(n_simulations <= length(seeds))

    time_results <- list()
    results <- list()
    estimates_sim <- list()

    for (error in error_types)
    {
        for (censor in censoring)
        {
            # Upper bound of the uniform censoring distribution handed to
            # datgen_model1(); -1 switches censoring off.
            censorNEW <- switch(as.character(censor),
                "0" = -1,
                "25" = 100,
                "50" = 30,
                "90" = 3,
                stop("no censoring bound defined for censor = ", censor, call. = FALSE)
            )

            for (a in algs)
            {
                scenario <- paste(error, a, censor, sep = "_")
                estimates <- matrix(NA, nrow = n_simulations, ncol = length(true_coefficients))
                times <- numeric(n_simulations) # elapsed time per replicate

                for (i in seq_len(n_simulations))
                {
                    # replicate i uses the same seed in every scenario
                    set.seed(seeds[i])
                    data <- datgen_model1(n = sample_size, error = error, censor = censorNEW)

                    # measure the time of model fitting
                    time_taken <- system.time({
                        fit <- aftsem(Surv(log(data$Time), data$status) ~ data$x1 + data$x2 + data$x3, method = "gehan", control = list(quantile.method = a))
                        estimates[i, ] <- c(fit$beta)
                    })

                    times[i] <- time_taken[3] # elapsed time
                }

                bias <- colMeans(estimates) - true_coefficients
                # sweep() subtracts coefficient j from column j; a plain
                # `estimates - true_coefficients` would recycle the vector down
                # the rows and mix up the coefficients.
                mse <- colMeans(sweep(estimates, 2, true_coefficients)^2)

                results[[scenario]] <- list(bias = bias, mse = mse)
                time_results[[scenario]] <- mean(times)
                estimates_sim[[scenario]] <- estimates
            }
        }
    }

    saveRDS(time_results, paste("time_results","median", ".rds", sep = "_"))
    saveRDS(results, paste("results","median",".rds", sep = "_"))
    saveRDS(estimates_sim, paste("estimates_sim","median",".rds", sep = "_"))
}

In [None]:
# 300 replicates with 400 observations per data set
run_simulations_for_median_reg(n_simulations = 300, sample_size = 400)

In [None]:
# sanity check
#tt <- readRDS("time_results_400_.rds")
#tm <- readRDS("time_results_median_.rds")

### See the results in dataframe

In [None]:
# One row per scenario and coefficient, built from the result list `rm` and
# the time list `tm`.
# NOTE(review): `rm` is not assigned in any visible cell. Unless a result list
# has been loaded under that name it refers to base::rm(), names(rm) is NULL
# and summary_df ends up NULL. The name also shadows base::rm(); confirm.
summary_df <- do.call(
  rbind,
  lapply(names(rm), function(scenario) {
    entry <- rm[[scenario]]
    data.frame(
      Scenario = scenario,
      Bias = entry$bias,
      MSE = entry$mse,
      Avg_Time = tm[[scenario]]
    )
  })
)

In [None]:
# Show the summary table (auto-printed as the value of the cell).
summary_df

In [None]:
# Take the "gehan" timings at 25/50/90 % censoring from `tt` and relabel them
# as "br" so they can be plotted next to the other algorithms.
# NOTE(review): presumably these fits used the "br" quantile method; confirm.
selected_items <- tt[grepl("_(gehan)_(25|50|90)$", names(tt))]
selected_names <- names(selected_items)
names(selected_items) <- sub("gehan", "br", selected_names)
print(selected_items)

In [None]:
# Add the relabelled "br" timings to the median-regression timings.
# Re-running this cell appends them again.
tm <- append(tm, selected_items)
tm

### Plot the time results of different median regression algorithms
The function is basically the same as the previous time-plotting function.

In [None]:
# Scatter plot of average fitting times against the censoring percentage for
# the median-regression algorithms, coloured by error distribution and shaped
# by algorithm.
#
# Arguments:
#   data_list - named list of average times; the names are parsed as
#               "<distribution>_<algorithm>_<censoring>"
#   filename  - file the plot is saved to (the default keeps the file name
#               that used to be hard-coded)
#
# Prints the parsed data frame as a sanity check and saves the plot with
# ggsave(); the plot itself is not printed.
plot_time_median_reg <- function(data_list, filename = "median_400.png")
{
    # convert the list to a data frame and split the names into their parts
    data <- enframe(data_list, name = "metric", value = "value") %>%
        mutate(
            metric = as.character(metric),
            # leading run of lower-case letters
            distribution = gsub("([a-z]+)_.*", "\\1", metric),
            # what is left between the distribution and the trailing number
            method = gsub("[a-z]+_(.*?)(_\\d+)?$", "\\1", metric),
            # trailing number
            percent_censoring = as.numeric(gsub(".*_(\\d+)$", "\\1", metric)),
            value = as.numeric(unlist(value))
        ) %>%
        select(distribution, method, percent_censoring, value)

    # sanity check
    print(data)

    p1 <- ggplot(data, aes(x = percent_censoring, y = value, color = distribution, shape = method)) +
        geom_point(size = 3, alpha = 0.6) +
        labs(
            title = "Scatter Plot časových výsledků u algoritmů mediánové regrese",
            x = "Míra Cenzorování[%]",
            y = "Čas[s]",
            color = "Rozdělení dat",
            shape = "Algoritmus"
        ) +
        theme_minimal() +
        scale_color_brewer(palette = "Set1") +
        theme(
            legend.position = "right",
            legend.title = element_text(size = 12),
            legend.text = element_text(size = 10)
        )
    ggsave(filename = filename, plot = p1)
}

In [None]:
# Plot the combined timings of the median-regression algorithms
plot_time_median_reg(data_list = tm)