In [1]:

######## snakemake preamble start (automatically inserted, do not edit) ########
library(methods)
# S4 class describing the job context passed into this notebook by the
# snakemake preamble. Each slot holds one piece of job information: the
# input/output/log file lists, params, wildcards, thread count, resources,
# config, the rule name, the benchmark iteration, the script directory, and a
# helper function stored in `source`.
Snakemake <- setClass(
    "Snakemake",
    slots = c(
        input = "list",
        output = "list",
        params = "list",
        wildcards = "list",
        threads = "numeric",
        log = "list",
        resources = "list",
        config = "list",
        rule = "character",
        bench_iteration = "numeric",
        scriptdir = "character",
        source = "function"
    )
)
# Job context for rule 'cases_controls_normalized_trimmed'. In the input,
# output and log lists every path appears twice: once positionally and once
# under its name, so both snakemake@input[[1]] and
# snakemake@input[["input_samples"]] style lookups return the same path.
snakemake <- Snakemake(
    input = list('data/sample_types.txt', 'data/data_rds_format/all_samples_normalized_trimmed.rds', "input_samples" = 'data/sample_types.txt', "input_data" = 'data/data_rds_format/all_samples_normalized_trimmed.rds'),
    output = list('data/cases_controls/cases_controls_rds_format/all_samples_normalized_trimmed_CASES.rds', 'data/cases_controls/cases_controls_rds_format/all_samples_normalized_trimmed_CONTROLS.rds', "output_cases_normalized_trimmed" = 'data/cases_controls/cases_controls_rds_format/all_samples_normalized_trimmed_CASES.rds', "output_controls_normalized_trimmed" = 'data/cases_controls/cases_controls_rds_format/all_samples_normalized_trimmed_CONTROLS.rds'),
    params = list(),
    wildcards = list(),
    threads = 1,
    log = list('logs/processed_notebooks/processed_splitting_cases_controls_norm_trimmed.r.ipynb', "notebook" = 'logs/processed_notebooks/processed_splitting_cases_controls_norm_trimmed.r.ipynb'),
    resources = list('mem_mb', 'disk_mb', 'tmpdir', 'time', "mem_mb" = 150000, "disk_mb" = 11139, "tmpdir" = '/scratch/56249438', "time" = '5:00:00'),
    config = list(),
    rule = 'cases_controls_normalized_trimmed',
    bench_iteration = as.numeric(NA),
    scriptdir = '/faststorage/project/DELFI1/Workspaces/CarmenAndAnika/Notebooks for data prep, formatting, normalizing/splitting_cases_controls',
    # Helper that calls source(...) with the working directory temporarily set
    # to `scriptdir`, then switches back to the previous directory.
    # NOTE(review): the restore is a plain statement, not an on.exit() handler,
    # so an error raised inside source(...) leaves the working directory
    # changed. This is generated code; do not patch it here.
    source = function(...){
        wd <- getwd()
        setwd(snakemake@scriptdir)
        source(...)
        setwd(wd)
    }
)
# Make the project root the working directory; the relative input/output paths
# listed above are resolved against it when files are read or written.
setwd('/faststorage/project/DELFI1/Workspaces/CarmenAndAnika');

######## snakemake preamble end #########


In [2]:
library(tidyverse)

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.1     ✔ purrr   0.3.2
✔ tibble  3.1.5     ✔ dplyr   1.0.7
✔ tidyr   1.1.4     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [3]:
# Path of the normalized, trimmed data set (an .rds file), taken from the
# named "input_data" entry of the snakemake job inputs.
filename <- snakemake@input[["input_data"]]

In [4]:
# Load the full data set (all samples) from the .rds file located above.
data <- readRDS(file = filename)

## Sample types

In [5]:
# Load the sample annotation table. The file has no header row, so the columns
# keep read.table's default names: V1 (sample ID) and V2 (sample type).
# `header` and `stringsAsFactors` are spelled out so that both columns are
# plain character vectors on every R version (the stringsAsFactors default
# changed from TRUE to FALSE in R 4.0.0) and the first data row can never be
# mistaken for a header.
sample_types <- read.table(
  snakemake@input[["input_samples"]],
  header = FALSE,
  stringsAsFactors = FALSE
)

# Preview the first rows of the annotation table.
head(sample_types)

V1,V2
PGDX10344P1,Gastric_cancer
PGDX10346P1,Gastric_cancer
PGDX10349P1,Gastric_cancer
PGDX10351P1,Gastric_cancer
PGDX10566P,Ovarian_Cancer
PGDX10567P,Ovarian_Cancer


In [6]:
# One-column table (V1) holding the IDs of all samples whose type (V2) is
# exactly "Healthy"; these become the controls.
healthy_individuals <- sample_types %>%
  filter(V2 == "Healthy") %>%
  select(V1)

# Preview the first control IDs.
head(healthy_individuals)

V1
PGDX16568P
PGDX16569P
PGDX16570P
PGDX16571P
PGDX16579P
PGDX16580P


In [7]:
# One-column table (V1) holding the IDs of all samples whose type (V2) is
# anything other than "Healthy"; these become the cases.
cancer_individuals <- sample_types %>%
  filter(V2 != "Healthy") %>%
  select(V1)

# Preview the first case IDs.
head(cancer_individuals)

V1
PGDX10344P1
PGDX10346P1
PGDX10349P1
PGDX10351P1
PGDX10566P
PGDX10567P


## Controls

In [8]:
# Controls: the rows of `data` whose `sample` ID is listed in column V1 of
# `healthy_individuals`.
data_controls <- data %>%
  filter(sample %in% healthy_individuals$V1)

# Preview the first rows of the control subset.
head(data_controls)

sample,bin,X80,X81,X82,X83,X84,X85,X86,X87,...,X391,X392,X393,X394,X395,X396,X397,X398,X399,X400
PGDX16568P,chr10_400,0.0,0.0,0.0,0,0.0,0.0001749781,0,0.0,...,0.0001749781,0.0003499563,0.0,0.0001749781,0.0,0.0001749781,0.0,0.0,0.0,0.0001749781
PGDX16568P,chr10_40,0.0,0.0,0.0002340276,0,0.0002340276,0.0,0,0.0,...,0.0002340276,0.0002340276,0.0,0.0002340276,0.0,0.0,0.0002340276,0.0,0.0002340276,0.0
PGDX16568P,chr10_4,0.0,0.0002093802,0.0002093802,0,0.0,0.0002093802,0,0.0,...,0.0,0.0,0.0,0.0004187605,0.0002093802,0.0,0.0,0.0002093802,0.0,0.0
PGDX16568P,chr10_401,0.0,0.0,0.0,0,0.0,0.0,0,0.000203666,...,0.000407332,0.0,0.000407332,0.0,0.0,0.0,0.0,0.000203666,0.0,0.000203666
PGDX16568P,chr10_402,0.0,0.0002337541,0.0,0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0002337541,0.0,0.0,0.0004675082,0.0004675082
PGDX16568P,chr10_403,0.0002130379,0.0,0.0,0,0.0,0.0,0,0.0,...,0.0004260758,0.0002130379,0.0002130379,0.0002130379,0.0002130379,0.0,0.0,0.0,0.0,0.0


In [9]:
# Sanity check: number of rows and columns in the control subset.
dim(data_controls)

In [10]:
# Write the control subset to the rule's
# "output_controls_normalized_trimmed" target as an .rds file.
saveRDS(
  data_controls,
  file = snakemake@output[["output_controls_normalized_trimmed"]]
)

## Cases

In [11]:
# Cases: the rows of `data` whose `sample` ID is listed in column V1 of
# `cancer_individuals`.
data_cases <- data %>%
  filter(sample %in% cancer_individuals$V1)

# Preview the first rows of the case subset.
head(data_cases)

sample,bin,X80,X81,X82,X83,X84,X85,X86,X87,...,X391,X392,X393,X394,X395,X396,X397,X398,X399,X400
PGDX10344P1,chr10_400,0,0.0,0.0,0.0,0,0.0002261932,0.0002261932,0.0,...,0.0002261932,0.0004523863,0.0,0.0002261932,0.0,0.0002261932,0.0,0.0,0.0,0.0002261932
PGDX10344P1,chr10_40,0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0.0,0.0002773925,0.0002773925,0.0002773925,0.0002773925,0.0,0.0002773925,0.0002773925,0.0002773925
PGDX10344P1,chr10_4,0,0.0,0.0002667378,0.0,0,0.0,0.0002667378,0.0,...,0.0002667378,0.0005334756,0.0002667378,0.0002667378,0.0,0.0005334756,0.0,0.0,0.0,0.0
PGDX10344P1,chr10_401,0,0.0,0.0,0.0002428363,0,0.0002428363,0.0,0.0002428363,...,0.0002428363,0.0,0.0,0.0,0.0,0.0,0.0,0.000728509,0.0004856727,0.0
PGDX10344P1,chr10_402,0,0.0,0.0,0.0002801905,0,0.0,0.0002801905,0.0,...,0.0002801905,0.0002801905,0.0002801905,0.0005603811,0.0002801905,0.0,0.0002801905,0.0002801905,0.0005603811,0.0002801905
PGDX10344P1,chr10_403,0,0.0005206977,0.0,0.0,0,0.0002603489,0.0,0.0,...,0.0002603489,0.0,0.0,0.0005206977,0.0,0.0002603489,0.0002603489,0.0002603489,0.0002603489,0.0005206977


In [12]:
# Sanity check: number of rows and columns in the case subset.
dim(data_cases)

In [13]:
# Write the case subset to the rule's "output_cases_normalized_trimmed"
# target as an .rds file.
saveRDS(
  data_cases,
  file = snakemake@output[["output_cases_normalized_trimmed"]]
)