In [1]:

######## snakemake preamble start (automatically inserted, do not edit) ########
library(methods)
# S4 class describing the job information that the Snakemake preamble passes
# to this notebook. Each slot below is filled in by the `Snakemake(...)`
# constructor call that follows this definition.
Snakemake <- setClass(
    "Snakemake",
    slots = c(
        input = "list",
        output = "list",
        params = "list",
        wildcards = "list",
        threads = "numeric",
        log = "list",
        resources = "list",
        config = "list",
        rule = "character",
        bench_iteration = "numeric",
        scriptdir = "character",
        source = "function"
    )
)
# Concrete job description for rule `splitting_norm_trimmed_group_by_individual`.
# The `input`, `output` and `log` lists contain each path twice: once as an
# unnamed (positional) entry and once under its name, so cells can look files
# up with `snakemake@input[["<name>"]]` / `snakemake@output[["<name>"]]`.
snakemake <- Snakemake(
    input = list('data/cases_controls/cases_controls_rds_format/all_samples_normalized_trimmed_CONTROLS.rds', "input_norm_trimmed_LASSO_TRAIN_DATA" = 'data/cases_controls/cases_controls_rds_format/all_samples_normalized_trimmed_CONTROLS.rds'),
    output = list('data/20_80_splits_group_by_ind/20_all_samples_normalized_trimmed_CONTROLS.rds', 'data/20_80_splits_group_by_ind/80_all_samples_normalized_trimmed_CONTROLS.rds', "OUTPUT_norm_trimmed_LASSO_DATA_20" = 'data/20_80_splits_group_by_ind/20_all_samples_normalized_trimmed_CONTROLS.rds', "OUTPUT_norm_trimmed_LASSO_DATA_80" = 'data/20_80_splits_group_by_ind/80_all_samples_normalized_trimmed_CONTROLS.rds'),
    params = list(),
    wildcards = list(),
    threads = 3,
    log = list('logs/processed_notebooks/processed_Data_splitting_20_80_group_by_individual_norm_trimmed.r.ipynb', "notebook" = 'logs/processed_notebooks/processed_Data_splitting_20_80_group_by_individual_norm_trimmed.r.ipynb'),
    resources = list('mem_mb', 'disk_mb', 'tmpdir', 'time', "mem_mb" = 150000, "disk_mb" = 5953, "tmpdir" = '/scratch/56255143', "time" = '5:00:00'),
    config = list(),
    rule = 'splitting_norm_trimmed_group_by_individual',
    bench_iteration = as.numeric(NA),
    scriptdir = '/faststorage/project/DELFI1/Workspaces/CarmenAndAnika/20_80_experiments',
    # Helper that calls source() with `scriptdir` as the working directory and
    # then switches back to the directory that was active before the call.
    # NOTE(review): the previous directory is restored with a plain setwd()
    # after source(); if source() raises an error the restore is skipped.
    source = function(...){
        wd <- getwd()
        setwd(snakemake@scriptdir)
        source(...)
        setwd(wd)
    }
)
# Change the working directory for all following cells; the relative paths in
# `input`, `output` and `log` above are then resolved against this directory.
setwd('/faststorage/project/DELFI1/Workspaces/CarmenAndAnika');

######## snakemake preamble end #########


In [2]:
library(tidyverse)

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.1     ✔ purrr   0.3.2
✔ tibble  3.1.5     ✔ dplyr   1.0.7
✔ tidyr   1.1.4     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [3]:
# Read the data set that Snakemake supplies under the named input
# `input_norm_trimmed_LASSO_TRAIN_DATA`. The variable keeps its `std` name
# because the following cells refer to it.
INPUT_std_LASSO_TRAIN_DATA <- readRDS(
  file = snakemake@input[["input_norm_trimmed_LASSO_TRAIN_DATA"]]
)
# Disabled loaders for other input variants (these inputs are not declared in
# the `snakemake@input` list of this notebook):
# INPUT_smooth_LASSO_TRAIN_DATA <- readRDS(snakemake@input[["input_smooth_LASSO_TRAIN_DATA"]])
# INPUT_trim_LASSO_TRAIN_DATA <- readRDS(snakemake@input[["input_trim_LASSO_TRAIN_DATA"]])

In [4]:
# Preview the first six rows (the default of head()) to check the layout:
# a `sample` column, a `bin` column and numeric `X<n>` columns.
head(INPUT_std_LASSO_TRAIN_DATA, n = 6L)

sample,bin,X80,X81,X82,X83,X84,X85,X86,X87,...,X391,X392,X393,X394,X395,X396,X397,X398,X399,X400
PGDX16568P,chr10_400,0.0,0.0,0.0,0,0.0,0.0001749781,0,0.0,...,0.0001749781,0.0003499563,0.0,0.0001749781,0.0,0.0001749781,0.0,0.0,0.0,0.0001749781
PGDX16568P,chr10_40,0.0,0.0,0.0002340276,0,0.0002340276,0.0,0,0.0,...,0.0002340276,0.0002340276,0.0,0.0002340276,0.0,0.0,0.0002340276,0.0,0.0002340276,0.0
PGDX16568P,chr10_4,0.0,0.0002093802,0.0002093802,0,0.0,0.0002093802,0,0.0,...,0.0,0.0,0.0,0.0004187605,0.0002093802,0.0,0.0,0.0002093802,0.0,0.0
PGDX16568P,chr10_401,0.0,0.0,0.0,0,0.0,0.0,0,0.000203666,...,0.000407332,0.0,0.000407332,0.0,0.0,0.0,0.0,0.000203666,0.0,0.000203666
PGDX16568P,chr10_402,0.0,0.0002337541,0.0,0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0002337541,0.0,0.0,0.0004675082,0.0004675082
PGDX16568P,chr10_403,0.0002130379,0.0,0.0,0,0.0,0.0,0,0.0,...,0.0004260758,0.0002130379,0.0002130379,0.0002130379,0.0002130379,0.0,0.0,0.0,0.0,0.0


In [5]:
# Split the data 80/20 at the level of individuals: every row that belongs to
# one `sample` ID ends up in the same partition, so no individual contributes
# rows to both output files.

# Fraction of individuals drawn into the larger ("80") partition.
train_fraction <- 0.8

# Number of distinct individuals in the input (displayed for reference).
INPUT_std_LASSO_TRAIN_DATA %>%
  summarise(Unique_Elements = n_distinct(sample))

ind_list <- INPUT_std_LASSO_TRAIN_DATA %>% distinct(sample)
ind_list <- ind_list$sample
length(ind_list)

# Derive the number of individuals to draw from the data instead of
# hard-coding it. With the 243 individuals this notebook was written for,
# floor(0.8 * 243) is 194, i.e. the value that used to be hard-coded, so the
# seeded draw below is unchanged for that input.
n_training <- floor(train_fraction * length(ind_list))

# Both partitions must receive at least one individual; fail early with a
# clear error instead of writing an empty split.
stopifnot(n_training >= 1, n_training < length(ind_list))

# Fixed seed so the same individuals are selected on every run.
set.seed(0)
training <- sample(ind_list, n_training, replace = FALSE)

# NOTE: the object names are historical and kept because the cell that saves
# the results refers to them:
#   std_LASSO_train_20 - rows of the ~20% of individuals NOT in `training`
#   std_LASSO_test_80  - rows of the ~80% of individuals in `training`
std_LASSO_train_20 <- INPUT_std_LASSO_TRAIN_DATA %>%
  filter(!sample %in% training)
std_LASSO_test_80 <- INPUT_std_LASSO_TRAIN_DATA %>%
  filter(sample %in% training)

# Row/column counts of the full data set and of both partitions.
dim(INPUT_std_LASSO_TRAIN_DATA)
dim(std_LASSO_train_20)
dim(std_LASSO_test_80)

Unique_Elements
243


In [6]:
# Write both partitions to the output files declared by the Snakemake rule.
# `std_LASSO_test_80` goes to the "80" output and `std_LASSO_train_20` to the
# "20" output.
saveRDS(
  object = std_LASSO_test_80,
  file = snakemake@output[["OUTPUT_norm_trimmed_LASSO_DATA_80"]]
)
saveRDS(
  object = std_LASSO_train_20,
  file = snakemake@output[["OUTPUT_norm_trimmed_LASSO_DATA_20"]]
)