## Discarding start and end of histograms

__This notebook discards the start and end of the histograms for all individuals and for the summed controls. The input files contain the already normalized (and, in some cases, smoothed) histograms. For the individuals, the input file is read in chunks; each chunk is trimmed and then appended to the output file.__

In [2]:
# packages

library(tidyverse)

── Attaching packages ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.5     ✔ dplyr   1.0.7
✔ tidyr   1.1.4     ✔ stringr 1.4.0
✔ readr   2.0.2     ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


### 1) Discarding start and end of histograms for all normalized individuals

In [2]:
# Input: normalized histograms of all individual samples.
filename <- "../data/all_samples_normalized.csv"

In [3]:
# Read only the first 10 data rows: enough to inspect the column layout
# without parsing the whole file.
small_test <- read_csv(
  file = filename,
  col_names = TRUE, # spelled out; `T` is an ordinary variable and can be reassigned
  comment = "#",
  skip = 0,
  n_max = 10
)
head(small_test)

Parsed with column specification:
cols(
  .default = col_double(),
  sample = col_character(),
  bin = col_character()
)
See spec(...) for full column specifications.


sample,bin,30,31,32,33,34,35,36,37,...,691,692,693,694,695,696,697,698,699,700
PGDX10344P1,chr10_400,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_40,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_401,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_402,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_403,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Build the header row for the trimmed output file: the two identifier columns
# followed by the retained histogram columns (positions 53:373, i.e. the
# columns named "80" to "400" in the preview), each prefixed with "X".
# Guard first: indexing past the last column would silently yield NA names.
if (ncol(small_test) < 373) {
  stop("Expected at least 373 columns in the input, found ", ncol(small_test),
       call. = FALSE)
}
cols_trimmed_first_two <- colnames(small_test)[1:2]
cols_trimmed_rest <- paste0("X", colnames(small_test)[53:373])

# One-row data frame, so that write.table() emits the names as a single line.
cols_trimmed <- as.data.frame(t(c(cols_trimmed_first_two, cols_trimmed_rest)))
cols_trimmed

V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323
sample,bin,X80,X81,X82,X83,X84,X85,X86,X87,...,X391,X392,X393,X394,X395,X396,X397,X398,X399,X400


In [9]:
# Start the output file with the header row only. `append = FALSE` already
# replaces any existing file, so a separate call that first writes an empty
# file is not needed.
write.table(
  cols_trimmed,
  file = "../data/all_samples_normalized_trimmed.csv",
  append = FALSE,
  sep = ",",
  col.names = FALSE, # the header text is the single data row of `cols_trimmed`
  row.names = FALSE
)

In [10]:
# Chunk callback for read_delim_chunked(): keep the two identifier columns plus
# the retained histogram columns and append the rows to the trimmed CSV.
#
# df1        one chunk of the input, handed over by readr.
# pos        chunk position supplied by readr; not used here.
# out_file   CSV file the trimmed rows are appended to (no header is written).
# keep_cols  column positions to keep; defaults to columns 1:2 and 53:373.
#
# Called for its side effect; returns whatever write.table() returns.
f <- function(df1, pos,
              out_file = "../data/all_samples_normalized_trimmed.csv",
              keep_cols = c(1:2, 53:373)) {
    # drop = FALSE keeps a data frame even if a single column is requested.
    output <- df1[, keep_cols, drop = FALSE]

    write.table(output, out_file, append = TRUE, sep = ",",
                col.names = FALSE, row.names = FALSE)
}


# Stream the input in chunks of 10,000 rows; `f` appends each trimmed chunk to
# the output file. `f` is called only for its side effect, so the side-effect
# callback type is used rather than one that collects per-chunk results.
# `df` carries no data (expected to be NULL — confirm); the assignment is kept
# only so the name stays defined as before.
# NOTE(review): the preview read passes comment = "#", this call does not —
# confirm the input file contains no comment lines.
df <- read_delim_chunked(
  file = filename,
  callback = SideEffectChunkCallback$new(f),
  chunk_size = 10000,
  col_names = TRUE,
  delim = ",",
  progress = show_progress()
)

Parsed with column specification:
cols(
  .default = col_double(),
  sample = col_character(),
  bin = col_character()
)
See spec(...) for full column specifications.


### 2) Discarding start and end of histograms for all the normalized and smoothed individuals

In [3]:
# Input: normalized and Gaussian-smoothed histograms of all individual samples.
filename <- "../data/all_samples_normalized_gaussian_smooth.csv"

In [4]:
# Read only the first 10 data rows: enough to inspect the column layout
# without parsing the whole file.
small_test <- read_csv(
  file = filename,
  col_names = TRUE, # spelled out; `T` is an ordinary variable and can be reassigned
  comment = "#",
  skip = 0,
  n_max = 10
)
head(small_test)

Parsed with column specification:
cols(
  .default = col_double(),
  sample = col_character(),
  bin = col_character()
)
See spec(...) for full column specifications.


sample,bin,30,31,32,33,34,35,36,37,...,691,692,693,694,695,696,697,698,699,700
PGDX10344P1,chr10_400,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_40,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_401,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_402,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_403,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Build the header row for the trimmed output file: the two identifier columns
# followed by the retained histogram columns (positions 53:373, i.e. the
# columns named "80" to "400" in the preview), each prefixed with "X".
# Guard first: indexing past the last column would silently yield NA names.
if (ncol(small_test) < 373) {
  stop("Expected at least 373 columns in the input, found ", ncol(small_test),
       call. = FALSE)
}
cols_trimmed_first_two <- colnames(small_test)[1:2]
cols_trimmed_rest <- paste0("X", colnames(small_test)[53:373])

# One-row data frame, so that write.table() emits the names as a single line.
cols_trimmed <- as.data.frame(t(c(cols_trimmed_first_two, cols_trimmed_rest)))
cols_trimmed

V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323
sample,bin,X80,X81,X82,X83,X84,X85,X86,X87,...,X391,X392,X393,X394,X395,X396,X397,X398,X399,X400


In [6]:
# Start the output file with the header row only. `append = FALSE` already
# replaces any existing file, so a separate call that first writes an empty
# file is not needed.
write.table(
  cols_trimmed,
  file = "../data/all_samples_normalized_gaussian_smooth_trimmed.csv",
  append = FALSE,
  sep = ",",
  col.names = FALSE, # the header text is the single data row of `cols_trimmed`
  row.names = FALSE
)

In [7]:
# Chunk callback for read_delim_chunked(): keep the two identifier columns plus
# the retained histogram columns and append the rows to the trimmed CSV.
#
# df1        one chunk of the input, handed over by readr.
# pos        chunk position supplied by readr; not used here.
# out_file   CSV file the trimmed rows are appended to (no header is written).
# keep_cols  column positions to keep; defaults to columns 1:2 and 53:373.
#
# Called for its side effect; returns whatever write.table() returns.
f <- function(df1, pos,
              out_file = "../data/all_samples_normalized_gaussian_smooth_trimmed.csv",
              keep_cols = c(1:2, 53:373)) {
    # drop = FALSE keeps a data frame even if a single column is requested.
    output <- df1[, keep_cols, drop = FALSE]

    write.table(output, out_file, append = TRUE, sep = ",",
                col.names = FALSE, row.names = FALSE)
}


# Stream the input in chunks of 10,000 rows; `f` appends each trimmed chunk to
# the output file. `f` is called only for its side effect, so the side-effect
# callback type is used rather than one that collects per-chunk results.
# `df` carries no data (expected to be NULL — confirm); the assignment is kept
# only so the name stays defined as before.
# NOTE(review): the preview read passes comment = "#", this call does not —
# confirm the input file contains no comment lines.
df <- read_delim_chunked(
  file = filename,
  callback = SideEffectChunkCallback$new(f),
  chunk_size = 10000,
  col_names = TRUE,
  delim = ",",
  progress = show_progress()
)

Parsed with column specification:
cols(
  .default = col_double(),
  sample = col_character(),
  bin = col_character()
)
See spec(...) for full column specifications.


### 3) Discarding start and end of histograms for all the normalized and gaussian smoothed summed controls

In [8]:
# Load the Gaussian-smoothed summed controls, then set the ATAC_val column
# aside so that only the histogram columns remain in the main table.
sum_control_ATAC_bin_rm_gaussian_mean <- read.csv(
  "../data/sum_control_normalized_gaussian_smooth_ATAC_bin_rm.csv"
)
ATAC_value <- select(sum_control_ATAC_bin_rm_gaussian_mean, ATAC_val)
head(ATAC_value)
sum_control_ATAC_bin_rm_gaussian_mean <- select(
  sum_control_ATAC_bin_rm_gaussian_mean,
  -ATAC_val
)

head(sum_control_ATAC_bin_rm_gaussian_mean)

ATAC_val
0.5931657
0.4089315
0.3058228
0.4764137
0.2862775
0.3169988


X30,X31,X32,X33,X34,X35,X36,X37,X38,X39,...,X691,X692,X693,X694,X695,X696,X697,X698,X699,X700
0,0,0,0,0,0,1.295662e-06,1.31524e-06,1.377055e-06,1.493326e-06,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
0,0,0,0,0,0,1.898063e-06,2.292049e-06,2.673739e-06,2.929634e-06,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
0,0,0,0,0,0,1.114137e-06,1.158812e-06,1.143512e-06,1.1428e-06,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
0,0,0,0,0,0,8.29244e-07,8.302439e-07,8.203466e-07,8.4553e-07,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
0,0,0,0,0,0,6.358502e-07,7.854683e-07,1.037024e-06,1.299074e-06,...,1.429406e-07,1.337957e-07,1.097238e-07,7.883745e-08,4.962914e-08,0,0,0,0,0
0,0,0,0,0,0,1.369131e-06,1.302297e-06,1.18174e-06,1.057154e-06,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0


In [11]:
# Keep histogram columns X80 to X400 and re-attach ATAC_val as the last column.
sum_control_normalized_gaussian_trimmed <-
  sum_control_ATAC_bin_rm_gaussian_mean %>%
  select(X80:X400) %>%
  mutate(ATAC_val = ATAC_value$ATAC_val)
head(sum_control_normalized_gaussian_trimmed)
dim(sum_control_normalized_gaussian_trimmed)


X80,X81,X82,X83,X84,X85,X86,X87,X88,X89,...,X392,X393,X394,X395,X396,X397,X398,X399,X400,ATAC_val
1.941075e-05,2.079943e-05,2.162218e-05,2.244632e-05,2.353735e-05,2.524666e-05,2.846163e-05,3.416138e-05,4.175699e-05,5.068301e-05,...,7.389605e-05,7.064061e-05,6.691617e-05,6.300203e-05,5.890176e-05,5.505384e-05,5.169955e-05,4.84625e-05,4.546832e-05,0.5931657
3.340475e-05,3.604897e-05,3.813424e-05,3.972881e-05,4.136925e-05,4.373924e-05,4.763574e-05,5.38764e-05,6.273863e-05,7.350752e-05,...,9.652966e-05,9.020873e-05,8.49957e-05,8.022667e-05,7.591143e-05,7.223015e-05,6.880952e-05,6.539239e-05,6.217557e-05,0.4089315
1.827843e-05,1.906753e-05,1.961684e-05,2.002694e-05,2.043854e-05,2.184018e-05,2.401005e-05,2.806282e-05,3.377775e-05,4.125139e-05,...,9.453006e-05,9.03274e-05,8.597152e-05,8.156974e-05,7.759355e-05,7.430006e-05,7.126369e-05,6.859701e-05,6.60489e-05,0.3058228
1.889019e-05,2.052863e-05,2.16671e-05,2.231962e-05,2.285308e-05,2.389903e-05,2.59113e-05,2.996872e-05,3.643993e-05,4.40015e-05,...,7.566216e-05,7.139018e-05,6.710408e-05,6.30336e-05,5.923884e-05,5.550531e-05,5.198491e-05,4.928195e-05,4.691004e-05,0.4764137
1.210448e-05,1.32552e-05,1.430822e-05,1.50718e-05,1.577416e-05,1.690351e-05,1.88152e-05,2.241909e-05,2.754345e-05,3.405389e-05,...,8.331454e-05,7.885015e-05,7.475785e-05,7.066735e-05,6.70336e-05,6.344876e-05,6.03222e-05,5.786202e-05,5.525848e-05,0.2862775
1.368989e-05,1.531414e-05,1.667634e-05,1.768305e-05,1.861072e-05,2.007812e-05,2.246905e-05,2.612024e-05,3.174561e-05,3.872263e-05,...,8.273963e-05,7.837361e-05,7.445714e-05,7.078112e-05,6.741255e-05,6.405234e-05,6.083425e-05,5.773046e-05,5.479946e-05,0.3169988


In [12]:
# Write the trimmed Gaussian-smoothed controls, with a header row, to CSV.
# Logical arguments are spelled out because `T`/`F` can be reassigned.
write.table(
  sum_control_normalized_gaussian_trimmed,
  "../data/sum_control_ATAC_bin_rm_gaussian_smooth_trimmed_CONTROLS.csv",
  append = FALSE,
  sep = ",",
  col.names = TRUE,
  row.names = FALSE
)

### 4) Discarding start and end of histograms for all the normalized and rolling mean smoothed summed controls

In [11]:
# Load the rolling-mean-smoothed summed controls and show the first rows.
sum_control_ATAC_bin_rm_sliding_mean <- read.csv(
  "../data/sum_control_ATAC_bin_rm_sliding_mean.csv"
)
head(sum_control_ATAC_bin_rm_sliding_mean)

X30,X31,X32,X33,X34,X35,X36,X37,X38,X39,...,X692,X693,X694,X695,X696,X697,X698,X699,X700,ATAC_val
1.449941e-06,1.491368e-06,1.522438e-06,1.353278e-06,1.47894e-06,1.423579e-06,1.581754e-06,1.581754e-06,1.502666e-06,1.581754e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5931657
8.313727e-07,8.313727e-07,1.454902e-06,1.662745e-06,1.82902e-06,2.040642e-06,2.116221e-06,2.342959e-06,2.494118e-06,2.645277e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4089315
7.254811e-07,9.949456e-07,1.305866e-06,1.16077e-06,1.131751e-06,1.028864e-06,1.028864e-06,9.497208e-07,1.108008e-06,1.108008e-06,...,0.0,0.0,0.0,7.91434e-08,8.705774e-08,9.673082e-08,1.088222e-07,1.243682e-07,1.450962e-07,0.3058228
8.503423e-07,8.503423e-07,7.440495e-07,7.558598e-07,7.653081e-07,7.730385e-07,8.503423e-07,8.503423e-07,9.276462e-07,8.503423e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4764137
3.138859e-07,4.035676e-07,3.531216e-07,6.277718e-07,5.649946e-07,7.704472e-07,8.560525e-07,1.198473e-06,1.369684e-06,1.455289e-06,...,8.560525e-08,8.560525e-08,8.560525e-08,8.560525e-08,0.0,0.0,0.0,0.0,0.0,0.2862775
1.266337e-06,1.206035e-06,1.055281e-06,1.219435e-06,1.097492e-06,1.151215e-06,1.151215e-06,1.151215e-06,1.227963e-06,9.977199e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3169988


In [12]:
# Set the ATAC_val column aside so that only the histogram columns remain in
# the main table; it is re-attached after trimming.
ATAC_value <- select(sum_control_ATAC_bin_rm_sliding_mean, ATAC_val)
head(ATAC_value)
sum_control_ATAC_bin_rm_sliding_mean <- select(
  sum_control_ATAC_bin_rm_sliding_mean,
  -ATAC_val
)

head(sum_control_ATAC_bin_rm_sliding_mean)

ATAC_val
0.5931657
0.4089315
0.3058228
0.4764137
0.2862775
0.3169988


X30,X31,X32,X33,X34,X35,X36,X37,X38,X39,...,X691,X692,X693,X694,X695,X696,X697,X698,X699,X700
1.449941e-06,1.491368e-06,1.522438e-06,1.353278e-06,1.47894e-06,1.423579e-06,1.581754e-06,1.581754e-06,1.502666e-06,1.581754e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8.313727e-07,8.313727e-07,1.454902e-06,1.662745e-06,1.82902e-06,2.040642e-06,2.116221e-06,2.342959e-06,2.494118e-06,2.645277e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7.254811e-07,9.949456e-07,1.305866e-06,1.16077e-06,1.131751e-06,1.028864e-06,1.028864e-06,9.497208e-07,1.108008e-06,1.108008e-06,...,0.0,0.0,0.0,0.0,7.91434e-08,8.705774e-08,9.673082e-08,1.088222e-07,1.243682e-07,1.450962e-07
8.503423e-07,8.503423e-07,7.440495e-07,7.558598e-07,7.653081e-07,7.730385e-07,8.503423e-07,8.503423e-07,9.276462e-07,8.503423e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.138859e-07,4.035676e-07,3.531216e-07,6.277718e-07,5.649946e-07,7.704472e-07,8.560525e-07,1.198473e-06,1.369684e-06,1.455289e-06,...,8.560525e-08,8.560525e-08,8.560525e-08,8.560525e-08,8.560525e-08,0.0,0.0,0.0,0.0,0.0
1.266337e-06,1.206035e-06,1.055281e-06,1.219435e-06,1.097492e-06,1.151215e-06,1.151215e-06,1.151215e-06,1.227963e-06,9.977199e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Keep histogram columns X80 to X400 and re-attach ATAC_val as the last column.
sum_control_normalized_sliding_trimmed <-
  sum_control_ATAC_bin_rm_sliding_mean %>%
  select(X80:X400) %>%
  mutate(ATAC_val = ATAC_value$ATAC_val)
head(sum_control_normalized_sliding_trimmed)
dim(sum_control_normalized_sliding_trimmed)


X80,X81,X82,X83,X84,X85,X86,X87,X88,X89,...,X392,X393,X394,X395,X396,X397,X398,X399,X400,ATAC_val
1.977192e-05,2.040463e-05,2.238182e-05,2.514989e-05,2.823431e-05,3.297957e-05,4.073016e-05,4.768988e-05,5.314693e-05,5.876216e-05,...,6.896447e-05,6.627549e-05,6.366559e-05,5.971121e-05,5.5915e-05,5.275149e-05,4.871802e-05,4.56336e-05,4.270736e-05,0.5931657
3.378396e-05,3.665598e-05,3.937683e-05,4.262674e-05,4.693477e-05,5.275438e-05,6.061463e-05,6.975973e-05,7.777114e-05,8.585812e-05,...,9.182889e-05,8.744529e-05,8.200358e-05,7.595723e-05,7.180037e-05,6.892835e-05,6.58296e-05,6.288201e-05,5.948094e-05,0.4089315
1.73324e-05,1.875698e-05,2.057728e-05,2.168529e-05,2.477188e-05,2.722533e-05,3.244879e-05,3.806797e-05,4.495345e-05,5.03352e-05,...,8.943204e-05,8.610802e-05,8.270485e-05,7.938083e-05,7.653166e-05,7.217878e-05,6.814246e-05,6.489758e-05,6.165271e-05,0.3058228
1.832101e-05,1.986709e-05,2.156777e-05,2.365498e-05,2.651522e-05,2.976198e-05,3.548247e-05,4.305824e-05,4.738726e-05,5.132975e-05,...,7.158336e-05,6.6404e-05,6.246151e-05,6.006509e-05,5.705024e-05,5.303044e-05,5.063402e-05,4.785108e-05,4.661422e-05,0.4764137
1.258397e-05,1.352563e-05,1.438168e-05,1.566576e-05,1.831952e-05,2.191494e-05,2.765049e-05,3.27012e-05,3.715268e-05,4.228899e-05,...,7.901364e-05,7.464778e-05,7.045312e-05,6.797057e-05,6.446075e-05,6.146457e-05,5.949565e-05,5.521538e-05,5.281844e-05,0.2862775
1.427507e-05,1.588677e-05,1.711473e-05,1.864969e-05,2.187309e-05,2.617096e-05,3.054558e-05,3.65319e-05,4.198098e-05,4.558812e-05,...,7.935711e-05,7.475225e-05,7.045437e-05,6.769146e-05,6.439131e-05,6.124465e-05,5.8098e-05,5.502809e-05,5.31094e-05,0.3169988


In [10]:
# Write the trimmed rolling-mean controls, with a header row, to CSV.
# Logical arguments are spelled out because `T`/`F` can be reassigned.
write.table(
  sum_control_normalized_sliding_trimmed,
  "../data/sum_control_ATAC_bin_rm_sliding_mean_trimmed_CONTROLS.csv",
  append = FALSE,
  sep = ",",
  col.names = TRUE,
  row.names = FALSE
)

In [15]:
# Also store the trimmed rolling-mean controls as an RDS file.
saveRDS(
  sum_control_normalized_sliding_trimmed,
  file = "../data/data_rds_format/sum_control_ATAC_bin_rm_sliding_mean_trimmed_CONTROLS.rds"
)

### 5) Discarding start and end of histograms for all the normalized and rolling mean smoothed individuals

In [None]:
# Input: normalized and rolling-mean-smoothed histograms of all individual samples.
filename <- "../data/all_samples_normalized_sliding_mean.csv"

In [None]:
# Read only the first 10 data rows: enough to inspect the column layout
# without parsing the whole file.
small_test <- read_csv(
  file = filename,
  col_names = TRUE, # spelled out; `T` is an ordinary variable and can be reassigned
  comment = "#",
  skip = 0,
  n_max = 10
)
head(small_test)

In [None]:
# Build the header row for the trimmed output file: the two identifier columns
# followed by the retained histogram columns (positions 53:373), each prefixed
# with "X".
# NOTE(review): presumably these are the columns named "80" to "400", as in
# the other per-individual files — confirm against this input's header.
# Guard first: indexing past the last column would silently yield NA names.
if (ncol(small_test) < 373) {
  stop("Expected at least 373 columns in the input, found ", ncol(small_test),
       call. = FALSE)
}
cols_trimmed_first_two <- colnames(small_test)[1:2]
cols_trimmed_rest <- paste0("X", colnames(small_test)[53:373])

# One-row data frame, so that write.table() emits the names as a single line.
cols_trimmed <- as.data.frame(t(c(cols_trimmed_first_two, cols_trimmed_rest)))
cols_trimmed

In [None]:
# Start the output file with the header row only. `append = FALSE` already
# replaces any existing file, so a separate call that first writes an empty
# file is not needed.
write.table(
  cols_trimmed,
  file = "../data/all_samples_normalized_sliding_mean_trimmed.csv",
  append = FALSE,
  sep = ",",
  col.names = FALSE, # the header text is the single data row of `cols_trimmed`
  row.names = FALSE
)

In [None]:
# Chunk callback for read_delim_chunked(): keep the two identifier columns plus
# the retained histogram columns and append the rows to the trimmed CSV.
#
# df1        one chunk of the input, handed over by readr.
# pos        chunk position supplied by readr; not used here.
# out_file   CSV file the trimmed rows are appended to (no header is written).
# keep_cols  column positions to keep; defaults to columns 1:2 and 53:373.
#
# Called for its side effect; returns whatever write.table() returns.
f <- function(df1, pos,
              out_file = "../data/all_samples_normalized_sliding_mean_trimmed.csv",
              keep_cols = c(1:2, 53:373)) {
    # drop = FALSE keeps a data frame even if a single column is requested.
    output <- df1[, keep_cols, drop = FALSE]

    write.table(output, out_file, append = TRUE, sep = ",",
                col.names = FALSE, row.names = FALSE)
}


# Stream the input in chunks of 10,000 rows; `f` appends each trimmed chunk to
# the output file. `f` is called only for its side effect, so the side-effect
# callback type is used rather than one that collects per-chunk results.
# `df` carries no data (expected to be NULL — confirm); the assignment is kept
# only so the name stays defined as before.
# NOTE(review): the preview read passes comment = "#", this call does not —
# confirm the input file contains no comment lines.
df <- read_delim_chunked(
  file = filename,
  callback = SideEffectChunkCallback$new(f),
  chunk_size = 10000,
  col_names = TRUE,
  delim = ",",
  progress = show_progress()
)