# Gaussian smoothing

__This notebook calculates the Gaussian-smoothed histograms for 1) all summed controls and 2) all individuals. The input file contains the already normalized histograms. The input file is read in chunks; the histograms in each chunk are smoothed and then appended to the output file.__

In [4]:
# packages

library(tidyverse)

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.1     ✔ purrr   0.3.2
✔ tibble  3.1.4     ✔ dplyr   1.0.7
✔ tidyr   1.1.3     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [10]:
#################################################################
#' Gaussian-weighted average of one window of a signal
#'
#' Builds normal-density weights over the positions of `bin`, centred on the
#' middle element, rescales them so they sum to 1, and returns the weighted
#' sum of `bin`.
#'
#' @param bin Numeric vector with an odd number of elements (the window).
#' @param sd_percentage Standard deviation of the kernel, given as a
#'   percentage of `length(bin)`. Must be a single positive, finite number.
#' @return A single numeric value: the weighted sum of `bin`.
gaussian_smoothing <- function(bin, sd_percentage = 25) {
  bin_size <- length(bin)
  if (bin_size %% 2 == 0) {
    stop("`bin` should have an odd number of elements", call. = FALSE)
  }
  # A zero, negative or non-finite spread makes `dnorm()` return Inf/NaN and
  # the result would silently become NaN, so reject it up front.
  if (!is.numeric(sd_percentage) || length(sd_percentage) != 1L ||
        !is.finite(sd_percentage) || sd_percentage <= 0) {
    stop("`sd_percentage` should be a single positive number", call. = FALSE)
  }

  # Normal density evaluated at each position, centred on the middle element
  kernel_sd <- bin_size * (sd_percentage / 100)
  weights <- dnorm(
    seq_len(bin_size),
    mean = ceiling(bin_size / 2),
    sd = kernel_sd
  )

  # The density is only sampled at `bin_size` integer positions, so the raw
  # weights do not sum to 1; rescale them so the result is a weighted mean.
  weights <- weights / sum(weights)

  # Weighted sum
  sum(bin * weights)
}
#################################################################

### 1) Gaussian smoothing of the summed controls

In [None]:
# Inputs for the summed-control run: the normalized input CSV, the number of
# histogram columns per row, and the width of the smoothing window.
filename <- "../data/sum_control_normalized_ATAC_bin_rm.csv"
signal_size <- 671
bin_size <- 11

In [4]:
# Preview: read only the first 20 data rows to inspect the column layout.
small_test <- read_csv(
  file = filename,
  col_names = TRUE,
  comment = "#",
  skip = 0,
  n_max = 20
)
head(small_test)

Parsed with column specification:
cols(
  .default = col_double()
)
See spec(...) for full column specifications.


X30,X31,X32,X33,X34,X35,X36,X37,X38,X39,...,X692,X693,X694,X695,X696,X697,X698,X699,X700,ATAC_val
8.699646e-07,3.479859e-06,2.609894e-06,0.0,8.699646e-07,8.699646e-07,1.739929e-06,1.739929e-06,0.0,2.609894e-06,...,0,0,0,0,0,0,0,0,0.0,0.5931657
8.313727e-07,8.313727e-07,0.0,8.313727e-07,1.662745e-06,8.313727e-07,8.313727e-07,5.819609e-06,3.325491e-06,3.325491e-06,...,0,0,0,0,0,0,0,0,0.0,0.4089315
8.705774e-07,1.741155e-06,8.705774e-07,8.705774e-07,0.0,0.0,2.611732e-06,3.482309e-06,0.0,8.705774e-07,...,0,0,0,0,0,0,0,0,8.705774e-07,0.3058228
0.0,1.700685e-06,0.0,8.503423e-07,8.503423e-07,1.700685e-06,8.503423e-07,0.0,8.503423e-07,8.503423e-07,...,0,0,0,0,0,0,0,0,0.0,0.4764137
0.0,9.416577e-07,0.0,0.0,9.416577e-07,0.0,9.416577e-07,0.0,2.824973e-06,0.0,...,0,0,0,0,0,0,0,0,0.0,0.2862775
0.0,0.0,0.0,4.221123e-06,1.688449e-06,1.688449e-06,8.442245e-07,0.0,2.532674e-06,0.0,...,0,0,0,0,0,0,0,0,0.0,0.3169988


In [5]:
# One-row data frame holding the input column names; it is written as the
# header line of the smoothed output file.
cols_smoothed <- as.data.frame(t(colnames(small_test)))
cols_smoothed

V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V663,V664,V665,V666,V667,V668,V669,V670,V671,V672
X30,X31,X32,X33,X34,X35,X36,X37,X38,X39,...,X692,X693,X694,X695,X696,X697,X698,X699,X700,ATAC_val


In [6]:
# Start the output file with the header row. `append = FALSE` truncates any
# existing file, so no separate "empty the file" write is needed beforehand.
write.table(
  cols_smoothed,
  file = "../data/sum_control_normalized_gaussian_smooth_ATAC_bin_rm.csv",
  append = FALSE,
  sep = ",",
  col.names = FALSE,
  row.names = FALSE
)

In [7]:
# Chunk callback for `read_delim_chunked()`: smooths every histogram row of
# one chunk and appends the result to the summed-control output CSV.
#
# df1: one chunk of the input; must contain the column `ATAC_val` plus at
#      least `signal_size` histogram columns.
# pos: chunk position supplied by readr (unused).
#
# Reads the globals `bin_size` and `signal_size` and calls
# `gaussian_smoothing()`. Returns NULL invisibly; the only effect is the rows
# appended to the output file.
f <- function(df1, pos) {
  n_rows <- nrow(df1)
  if (n_rows == 0L) {
    # Nothing to smooth or write (a `1:nrow(df1)` loop would run on c(1, 0)).
    return(invisible(NULL))
  }

  ATAC <- pull(df1, ATAC_val)

  # Convert the histogram columns to a plain numeric matrix once, instead of
  # transposing a one-row data frame for every row.
  signal <- df1 %>%
    select(-ATAC_val) %>%
    as.matrix() %>%
    unname()
  signal <- signal[, seq_len(signal_size), drop = FALSE]
  storage.mode(signal) <- "double"

  n_windows <- signal_size - bin_size
  window_offsets <- seq_len(bin_size) - 1L
  left_pad <- ceiling(bin_size / 2)

  # NOTE(review): window i spans columns i..(i + bin_size - 1), whose centre
  # is column i + floor(bin_size / 2), yet its value is stored at column
  # i + ceiling(bin_size / 2), and only signal_size - bin_size of the
  # signal_size - bin_size + 1 possible windows are evaluated. This looks
  # like a one-column shift; the layout is kept unchanged so output stays
  # comparable with files already written -- confirm before changing.

  # Preallocated with zeros: the untouched leading `left_pad` and trailing
  # floor(bin_size / 2) columns form the zero padding.
  output <- matrix(0, nrow = n_rows, ncol = signal_size)
  for (row in seq_len(n_rows)) {
    row_cur <- signal[row, ]
    output[row, left_pad + seq_len(n_windows)] <- vapply(
      seq_len(n_windows),
      function(i) gaussian_smoothing(row_cur[i + window_offsets]),
      numeric(1)
    )
  }

  output <- as.data.frame(output)
  output$ATAC_val <- ATAC

  write.table(
    output,
    "../data/sum_control_normalized_gaussian_smooth_ATAC_bin_rm.csv",
    append = TRUE,
    sep = ",",
    col.names = FALSE,
    row.names = FALSE
  )
  invisible(NULL)
}
    


# Stream the input in chunks of 10000 rows; `f` smooths each chunk and
# appends it to the output file.
df <- read_delim_chunked(
  file = filename,
  callback = DataFrameCallback$new(f),
  delim = ",",
  col_names = TRUE,
  chunk_size = 10000,
  progress = show_progress()
)

Parsed with column specification:
cols(
  .default = col_double()
)
See spec(...) for full column specifications.


### 2) Gaussian smoothing of all individuals

In [7]:
# Inputs for the per-individual run: the normalized input CSV, the number of
# histogram columns per row, and the width of the smoothing window.
filename <- "../data/all_samples_normalized.csv"
signal_size <- 671
bin_size <- 11

In [4]:
# Preview: read only the first 10 data rows to inspect the column layout.
small_test <- read_csv(
  file = filename,
  col_names = TRUE,
  comment = "#",
  skip = 0,
  n_max = 10
)
head(small_test)

Parsed with column specification:
cols(
  .default = col_double(),
  sample = col_character(),
  bin = col_character()
)
See spec(...) for full column specifications.


sample,bin,X30,X31,X32,X33,X34,X35,X36,X37,...,X691,X692,X693,X694,X695,X696,X697,X698,X699,X700
PGDX10344P1,chr10_400,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_40,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_401,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_402,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_403,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# One-row data frame holding the input column names; it is written as the
# header line of the smoothed output file.
cols_smoothed <- as.data.frame(t(colnames(small_test)))
cols_smoothed

V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V664,V665,V666,V667,V668,V669,V670,V671,V672,V673
sample,bin,X30,X31,X32,X33,X34,X35,X36,X37,...,X691,X692,X693,X694,X695,X696,X697,X698,X699,X700


In [6]:
# Start the output file with the header row. `append = FALSE` truncates any
# existing file, so no separate "empty the file" write is needed beforehand.
write.table(
  cols_smoothed,
  file = "../data/all_samples_normalized_gaussian_smooth.csv",
  append = FALSE,
  sep = ",",
  col.names = FALSE,
  row.names = FALSE
)

In [None]:
# Chunk callback for `read_delim_chunked()`: smooths every histogram row of
# one chunk and appends the result to the per-individual output CSV.
#
# df1: one chunk of the input; must contain the columns `sample` and `bin`
#      plus at least `signal_size` histogram columns.
# pos: chunk position supplied by readr (unused).
#
# Reads the globals `bin_size` and `signal_size` and calls
# `gaussian_smoothing()`. Returns NULL invisibly; the only effect is the rows
# appended to the output file.
f2 <- function(df1, pos) {
  n_rows <- nrow(df1)
  if (n_rows == 0L) {
    # Nothing to smooth or write (a `1:nrow(df)` loop would run on c(1, 0)).
    return(invisible(NULL))
  }

  # Histogram columns as a plain numeric matrix (identifier columns dropped).
  signal <- df1 %>%
    select(-c("sample", "bin")) %>%
    as.matrix() %>%
    unname()
  signal <- signal[, seq_len(signal_size), drop = FALSE]
  storage.mode(signal) <- "double"

  n_windows <- signal_size - bin_size
  window_offsets <- seq_len(bin_size) - 1L
  left_pad <- ceiling(bin_size / 2)

  # NOTE(review): window i spans columns i..(i + bin_size - 1), whose centre
  # is column i + floor(bin_size / 2), yet its value is stored at column
  # i + ceiling(bin_size / 2), and only signal_size - bin_size of the
  # signal_size - bin_size + 1 possible windows are evaluated. This looks
  # like a one-column shift; the layout is kept unchanged so output stays
  # comparable with files already written -- confirm before changing.

  # Preallocated with zeros: the untouched leading `left_pad` and trailing
  # floor(bin_size / 2) columns form the zero padding.
  output <- matrix(0, nrow = n_rows, ncol = signal_size)
  for (row in seq_len(n_rows)) {
    row_cur <- signal[row, ]
    output[row, left_pad + seq_len(n_windows)] <- vapply(
      seq_len(n_windows),
      function(i) gaussian_smoothing(row_cur[i + window_offsets]),
      numeric(1)
    )
  }

  output <- data.frame(df1$sample, df1$bin, output)

  write.table(
    output,
    "../data/all_samples_normalized_gaussian_smooth.csv",
    append = TRUE,
    sep = ",",
    col.names = FALSE,
    row.names = FALSE
  )
  invisible(NULL)
}


# Stream the input in chunks of 10000 rows; `f2` smooths each chunk and
# appends it to the output file.
df <- read_delim_chunked(
  file = filename,
  callback = DataFrameCallback$new(f2),
  delim = ",",
  col_names = TRUE,
  chunk_size = 10000,
  progress = show_progress()
)

Parsed with column specification:
cols(
  .default = col_double(),
  sample = col_character(),
  bin = col_character()
)
See spec(...) for full column specifications.


##### The smoothing of all individuals stopped with an error before it could finish – we try to fix it here

In [7]:
# Re-read the first rows without treating line 1 as a header, so the header
# line itself appears as the first data row.
filename <- "../data/all_samples_normalized.csv"
small_test <- read_csv(
  file = filename,
  col_names = FALSE,
  comment = "#",
  skip = 0,
  n_max = 10
)
head(small_test)

Parsed with column specification:
cols(
  .default = col_character()
)
See spec(...) for full column specifications.


X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X664,X665,X666,X667,X668,X669,X670,X671,X672,X673
sample,bin,X30,X31,X32,X33,X34,X35,X36,X37,...,X691,X692,X693,X694,X695,X696,X697,X698,X699,X700
PGDX10344P1,chr10_400,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_40,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_401,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGDX10344P1,chr10_402,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Inputs for the follow-up run: the tail-copy input CSV, the number of
# histogram columns per row, and the width of the smoothing window.
filename <- "../data/all_samples_normalized_tail_copy.csv"
signal_size <- 671
bin_size <- 11

In [12]:
# Chunk callback for `read_delim_chunked()` on a header-less input: smooths
# every histogram row of one chunk and appends the result to the
# per-individual output CSV.
#
# df1: one chunk of the input; columns 1 and 2 are passed through unchanged,
#      the next `signal_size` columns are the histogram.
# pos: chunk position supplied by readr (unused).
#
# Reads the globals `bin_size` and `signal_size` and calls
# `gaussian_smoothing()`. Returns NULL invisibly; the only effect is the rows
# appended to the output file.
f3 <- function(df1, pos) {
  n_rows <- nrow(df1)
  if (n_rows == 0L) {
    # Nothing to smooth or write (a `1:nrow(df)` loop would run on c(1, 0)).
    return(invisible(NULL))
  }

  # Histogram columns as a plain numeric matrix, converted once per chunk.
  signal <- unname(as.matrix(df1[2L + seq_len(signal_size)]))
  storage.mode(signal) <- "double"

  n_windows <- signal_size - bin_size
  window_offsets <- seq_len(bin_size) - 1L
  left_pad <- ceiling(bin_size / 2)

  # NOTE(review): window i spans columns i..(i + bin_size - 1), whose centre
  # is column i + floor(bin_size / 2), yet its value is stored at column
  # i + ceiling(bin_size / 2), and only signal_size - bin_size of the
  # signal_size - bin_size + 1 possible windows are evaluated. This looks
  # like a one-column shift; the layout is kept unchanged so output stays
  # comparable with files already written -- confirm before changing.

  # Preallocated with zeros: the untouched leading `left_pad` and trailing
  # floor(bin_size / 2) columns form the zero padding.
  output <- matrix(0, nrow = n_rows, ncol = signal_size)
  for (row in seq_len(n_rows)) {
    row_cur <- signal[row, ]
    output[row, left_pad + seq_len(n_windows)] <- vapply(
      seq_len(n_windows),
      function(i) gaussian_smoothing(row_cur[i + window_offsets]),
      numeric(1)
    )
  }

  output <- data.frame(df1[1], df1[2], output)
  write.table(
    output,
    "../data/all_samples_normalized_gaussian_smooth.csv",
    append = TRUE,
    sep = ",",
    col.names = FALSE,
    row.names = FALSE
  )
  invisible(NULL)
}


# Stream the header-less input in chunks of 10000 rows; `f3` smooths each
# chunk and appends it to the existing output file.
df <- read_delim_chunked(
  file = filename,
  callback = DataFrameCallback$new(f3),
  delim = ",",
  col_names = FALSE,
  chunk_size = 10000,
  progress = show_progress()
)

Parsed with column specification:
cols(
  .default = col_double(),
  X1 = col_character(),
  X2 = col_character()
)
See spec(...) for full column specifications.
