## Discarding start and end of histograms

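__This notebook discards the start and end of the histograms for all individuals and for the summed controls, keeping only the columns labelled 80–400 out of the original 30–700. The input files contain the already normalized (and, in some cases, smoothed) histograms. For the per-individual files, the input is read in chunks, each chunk is trimmed, and the trimmed rows are appended to the output file; the summed-control table is trimmed in memory and written out in one step.__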

In [2]:
# packages

library(tidyverse)

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.1     ✔ purrr   0.3.2
✔ tibble  3.1.4     ✔ dplyr   1.0.7
✔ tidyr   1.1.3     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


### 1) Discarding start and end of histograms for all normalized individuals

In [2]:
# Input: CSV of normalized histograms for all individual samples.
filename <- "../data/all_samples_normalized.csv"

In [3]:
# Read only the first 10 data rows to inspect the column layout without
# loading the whole file.
small_test <- read_csv(
  file = filename,
  col_names = TRUE,
  comment = "#",
  skip = 0,
  n_max = 10
)
head(small_test)

Parsed with column specification:
cols(
  .default = col_double(),
  sample = col_character(),
  bin = col_character()
)
See spec(...) for full column specifications.


sample,bin,30,31,32,33,34,35,36,37,...,691,692,693,694,695,696,697,698,699,700
PGDX10344P1,chr10_400,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_40,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_401,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_402,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_403,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Build the header row for the trimmed output file.
# Columns 1-2 are the identifier columns (`sample`, `bin`); positions 53:373
# are the histogram columns headed "80" through "400" in the preview above.
cols_trimmed_first_two <- colnames(small_test)[1:2]
cols_trimmed_rest <- colnames(small_test)[53:373]

# The slice is positional, so fail loudly if the input layout ever changes
# instead of silently writing a header that does not match the data.
stopifnot(
  identical(cols_trimmed_first_two, c("sample", "bin")),
  identical(cols_trimmed_rest, as.character(80:400))
)

# Prefix the numeric headers with "X" (X80 ... X400), the same naming form the
# summed-control table in section 3 has.
cols_trimmed_rest <- paste0("X", cols_trimmed_rest)

# One-row data frame so that write.table() emits the names as a single line.
cols_trimmed <- as.data.frame(t(c(cols_trimmed_first_two, cols_trimmed_rest)))
cols_trimmed

V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323
sample,bin,X80,X81,X82,X83,X84,X85,X86,X87,...,X391,X392,X393,X394,X395,X396,X397,X398,X399,X400


In [9]:
# Start the output file with the trimmed header row.
# `append = FALSE` replaces any existing file of that name, so a separate
# "write an empty file first" step is unnecessary; the chunk callback in the
# next cell then appends the data rows below this header.
write.table(cols_trimmed,
            file = "../data/all_samples_normalized_trimmed.csv",
            append = FALSE,
            sep = ",",
            col.names = FALSE,
            row.names = FALSE)

In [10]:
# Chunk callback: keep the two identifier columns plus the histogram columns at
# positions 53:373 (headers "80" .. "400" in the preview above) and append the
# rows to the output file, whose header line was written in the previous cell.
#
# df1: one chunk of the input file, as supplied by readr.
# pos: position argument supplied by readr to every chunk callback; unused.
# Returns NULL invisibly; the function is called only for its side effect.
f <- function(df1, pos) {
    trimmed_first_two <- df1[, 1:2]
    trimmed_rest <- df1[, 53:373]

    output <- cbind(trimmed_first_two, trimmed_rest)

    write.table(output, "../data/all_samples_normalized_trimmed.csv",
                append = TRUE, sep = ",", col.names = FALSE, row.names = FALSE)
    invisible(NULL)
}

# The callback only writes to disk, so use SideEffectChunkCallback instead of
# DataFrameCallback, which would collect each chunk's return value in order to
# combine them at the end. `df` is still assigned for compatibility, but no
# data is returned into it.
df <- read_delim_chunked(file = filename,
                         callback = SideEffectChunkCallback$new(f),
                         chunk_size = 10000, col_names = TRUE,
                         delim = ",",
                         progress = show_progress())

Parsed with column specification:
cols(
  .default = col_double(),
  sample = col_character(),
  bin = col_character()
)
See spec(...) for full column specifications.


### 2) Discarding start and end of histograms for all the normalized and smoothed individuals

In [3]:
# Input: CSV of normalized, Gaussian-smoothed histograms for all individual
# samples.
filename <- "../data/all_samples_normalized_gaussian_smooth.csv"

In [4]:
# Read only the first 10 data rows to inspect the column layout without
# loading the whole file.
small_test <- read_csv(
  file = filename,
  col_names = TRUE,
  comment = "#",
  skip = 0,
  n_max = 10
)
head(small_test)

Parsed with column specification:
cols(
  .default = col_double(),
  sample = col_character(),
  bin = col_character()
)
See spec(...) for full column specifications.


sample,bin,30,31,32,33,34,35,36,37,...,691,692,693,694,695,696,697,698,699,700
PGDX10344P1,chr10_400,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_40,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_401,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_402,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_403,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Build the header row for the trimmed output file.
# Columns 1-2 are the identifier columns (`sample`, `bin`); positions 53:373
# are the histogram columns headed "80" through "400" in the preview above.
cols_trimmed_first_two <- colnames(small_test)[1:2]
cols_trimmed_rest <- colnames(small_test)[53:373]

# The slice is positional, so fail loudly if the input layout ever changes
# instead of silently writing a header that does not match the data.
stopifnot(
  identical(cols_trimmed_first_two, c("sample", "bin")),
  identical(cols_trimmed_rest, as.character(80:400))
)

# Prefix the numeric headers with "X" (X80 ... X400), the same naming form the
# summed-control table in section 3 has.
cols_trimmed_rest <- paste0("X", cols_trimmed_rest)

# One-row data frame so that write.table() emits the names as a single line.
cols_trimmed <- as.data.frame(t(c(cols_trimmed_first_two, cols_trimmed_rest)))
cols_trimmed

V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323
sample,bin,X80,X81,X82,X83,X84,X85,X86,X87,...,X391,X392,X393,X394,X395,X396,X397,X398,X399,X400


In [6]:
# Start the output file with the trimmed header row.
# `append = FALSE` replaces any existing file of that name, so a separate
# "write an empty file first" step is unnecessary; the chunk callback in the
# next cell then appends the data rows below this header.
write.table(cols_trimmed,
            file = "../data/all_samples_normalized_gaussian_smooth_trimmed.csv",
            append = FALSE,
            sep = ",",
            col.names = FALSE,
            row.names = FALSE)

In [7]:
# Chunk callback: keep the two identifier columns plus the histogram columns at
# positions 53:373 (headers "80" .. "400" in the preview above) and append the
# rows to the output file, whose header line was written in the previous cell.
#
# df1: one chunk of the input file, as supplied by readr.
# pos: position argument supplied by readr to every chunk callback; unused.
# Returns NULL invisibly; the function is called only for its side effect.
f <- function(df1, pos) {
    trimmed_first_two <- df1[, 1:2]
    trimmed_rest <- df1[, 53:373]

    output <- cbind(trimmed_first_two, trimmed_rest)

    write.table(output,
                "../data/all_samples_normalized_gaussian_smooth_trimmed.csv",
                append = TRUE, sep = ",", col.names = FALSE, row.names = FALSE)
    invisible(NULL)
}

# The callback only writes to disk, so use SideEffectChunkCallback instead of
# DataFrameCallback, which would collect each chunk's return value in order to
# combine them at the end. `df` is still assigned for compatibility, but no
# data is returned into it.
df <- read_delim_chunked(file = filename,
                         callback = SideEffectChunkCallback$new(f),
                         chunk_size = 10000, col_names = TRUE,
                         delim = ",",
                         progress = show_progress())

Parsed with column specification:
cols(
  .default = col_double(),
  sample = col_character(),
  bin = col_character()
)
See spec(...) for full column specifications.


### 3) Discarding start and end of histograms for all the normalized and smoothed summed controls

In [8]:
# Load the summed-control table, then set the ATAC_val column aside so that
# the remaining data frame holds only the histogram columns.
sum_control_ATAC_bin_rm_gaussian_mean <- read.csv(
  "../data/sum_control_normalized_gaussian_smooth_ATAC_bin_rm.csv"
)
ATAC_value <- select(sum_control_ATAC_bin_rm_gaussian_mean, ATAC_val)
head(ATAC_value)

sum_control_ATAC_bin_rm_gaussian_mean <- select(
  sum_control_ATAC_bin_rm_gaussian_mean,
  -ATAC_val
)
head(sum_control_ATAC_bin_rm_gaussian_mean)

ATAC_val
0.5931657
0.4089315
0.3058228
0.4764137
0.2862775
0.3169988


X30,X31,X32,X33,X34,X35,X36,X37,X38,X39,...,X691,X692,X693,X694,X695,X696,X697,X698,X699,X700
0,0,0,0,0,0,1.295662e-06,1.31524e-06,1.377055e-06,1.493326e-06,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
0,0,0,0,0,0,1.898063e-06,2.292049e-06,2.673739e-06,2.929634e-06,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
0,0,0,0,0,0,1.114137e-06,1.158812e-06,1.143512e-06,1.1428e-06,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
0,0,0,0,0,0,8.29244e-07,8.302439e-07,8.203466e-07,8.4553e-07,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
0,0,0,0,0,0,6.358502e-07,7.854683e-07,1.037024e-06,1.299074e-06,...,1.429406e-07,1.337957e-07,1.097238e-07,7.883745e-08,4.962914e-08,0,0,0,0,0
0,0,0,0,0,0,1.369131e-06,1.302297e-06,1.18174e-06,1.057154e-06,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0


In [11]:
# Keep only the histogram columns X80 through X400, then re-attach the
# ATAC_val column that was set aside earlier as the last column.
sum_control_normalized_gaussian_trimmed <-
  sum_control_ATAC_bin_rm_gaussian_mean %>%
  select(X80:X400) %>%
  mutate(ATAC_val = ATAC_value$ATAC_val)

# Inspect the first rows and confirm the resulting dimensions.
head(sum_control_normalized_gaussian_trimmed)
dim(sum_control_normalized_gaussian_trimmed)


X80,X81,X82,X83,X84,X85,X86,X87,X88,X89,...,X392,X393,X394,X395,X396,X397,X398,X399,X400,ATAC_val
1.941075e-05,2.079943e-05,2.162218e-05,2.244632e-05,2.353735e-05,2.524666e-05,2.846163e-05,3.416138e-05,4.175699e-05,5.068301e-05,...,7.389605e-05,7.064061e-05,6.691617e-05,6.300203e-05,5.890176e-05,5.505384e-05,5.169955e-05,4.84625e-05,4.546832e-05,0.5931657
3.340475e-05,3.604897e-05,3.813424e-05,3.972881e-05,4.136925e-05,4.373924e-05,4.763574e-05,5.38764e-05,6.273863e-05,7.350752e-05,...,9.652966e-05,9.020873e-05,8.49957e-05,8.022667e-05,7.591143e-05,7.223015e-05,6.880952e-05,6.539239e-05,6.217557e-05,0.4089315
1.827843e-05,1.906753e-05,1.961684e-05,2.002694e-05,2.043854e-05,2.184018e-05,2.401005e-05,2.806282e-05,3.377775e-05,4.125139e-05,...,9.453006e-05,9.03274e-05,8.597152e-05,8.156974e-05,7.759355e-05,7.430006e-05,7.126369e-05,6.859701e-05,6.60489e-05,0.3058228
1.889019e-05,2.052863e-05,2.16671e-05,2.231962e-05,2.285308e-05,2.389903e-05,2.59113e-05,2.996872e-05,3.643993e-05,4.40015e-05,...,7.566216e-05,7.139018e-05,6.710408e-05,6.30336e-05,5.923884e-05,5.550531e-05,5.198491e-05,4.928195e-05,4.691004e-05,0.4764137
1.210448e-05,1.32552e-05,1.430822e-05,1.50718e-05,1.577416e-05,1.690351e-05,1.88152e-05,2.241909e-05,2.754345e-05,3.405389e-05,...,8.331454e-05,7.885015e-05,7.475785e-05,7.066735e-05,6.70336e-05,6.344876e-05,6.03222e-05,5.786202e-05,5.525848e-05,0.2862775
1.368989e-05,1.531414e-05,1.667634e-05,1.768305e-05,1.861072e-05,2.007812e-05,2.246905e-05,2.612024e-05,3.174561e-05,3.872263e-05,...,8.273963e-05,7.837361e-05,7.445714e-05,7.078112e-05,6.741255e-05,6.405234e-05,6.083425e-05,5.773046e-05,5.479946e-05,0.3169988


In [12]:
# Save the trimmed summed-control table as CSV, with a header row and without
# row names.
write.table(
  sum_control_normalized_gaussian_trimmed,
  file = "../data/sum_control_normalized_gaussian_smooth_trimmed_ATAC_bin_rm.csv",
  append = FALSE,
  sep = ",",
  col.names = TRUE,
  row.names = FALSE
)