In [1]:
library(tidyverse)

── Attaching packages ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.5     ✔ dplyr   1.0.7
✔ tidyr   1.1.4     ✔ stringr 1.4.0
✔ readr   2.0.2     ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


### Read in the methylation file

In [2]:
# Load the headerless, tab-separated methylation bedgraph and preview the
# first rows.
met_df <- read.table(
  "../data/methylation/GM12878_WGBS_CpG_mean_hg19.bedgraph",
  header = FALSE,
  sep = "\t"
)
met_df %>% head()

V1,V2,V3,V4
chr1,10468,10469,75.0
chr1,10469,10470,66.6667
chr1,10470,10471,83.3333
chr1,10471,10472,100.0
chr1,10483,10485,100.0
chr1,10488,10489,75.0


In [3]:
# Give the four bedgraph columns descriptive names and make sure the
# chromosome column is plain character, then preview content and size.
names(met_df) <- c("chr", "start", "end", "methylation")
met_df[["chr"]] <- as.character(met_df[["chr"]])
met_df %>% head()
met_df %>% dim()

chr,start,end,methylation
chr1,10468,10469,75.0
chr1,10469,10470,66.6667
chr1,10470,10471,83.3333
chr1,10471,10472,100.0
chr1,10483,10485,100.0
chr1,10488,10489,75.0


In [4]:
# Order the methylation records by chromosome name, then by start coordinate.
met_df <- arrange(met_df, chr, start)

### Read in the file with the ATAC values

In [5]:
# Load the normalized control ATAC table and keep only the bin label and its
# ATAC value.
sum_control_ATAC_bin_rm <- read.csv("../data/sum_control_normalized_ATAC.csv")

# bin_id keeps an untouched copy of the bin label, because `bin` itself is
# split into chromosome and bin number further down. The column is referenced
# directly inside mutate() rather than via ATAC$bin, so the expression always
# reads from the data being piped.
ATAC <- sum_control_ATAC_bin_rm %>%
  select(bin, ATAC_val) %>%
  mutate(bin_id = bin)

str(ATAC)
head(ATAC)

'data.frame':	10303 obs. of  3 variables:
 $ bin     : Factor w/ 10303 levels "chr1_100","chr1_101",..: 851 849 850 852 853 854 855 856 857 858 ...
 $ ATAC_val: num  0.593 0.409 0.306 0.476 0.286 ...
 $ bin_id  : Factor w/ 10303 levels "chr1_100","chr1_101",..: 851 849 850 852 853 854 855 856 857 858 ...


bin,ATAC_val,bin_id
chr10_100,0.5931657,chr10_100
chr10_1,0.4089315,chr10_1
chr10_10,0.3058228,chr10_10
chr10_101,0.4764137,chr10_101
chr10_102,0.2862775,chr10_102
chr10_103,0.3169988,chr10_103


In [6]:
# ATAC_val is already numeric, so only the bin label needs converting to
# character.
ATAC[["bin"]] <- as.character(ATAC[["bin"]])

In [7]:
# Confirm the column types and dimensions after the conversion.
ATAC %>% str()
ATAC %>% dim()

'data.frame':	10303 obs. of  3 variables:
 $ bin     : chr  "chr10_100" "chr10_1" "chr10_10" "chr10_101" ...
 $ ATAC_val: num  0.593 0.409 0.306 0.476 0.286 ...
 $ bin_id  : Factor w/ 10303 levels "chr1_100","chr1_101",..: 851 849 850 852 853 854 855 856 857 858 ...


In [8]:
#ATAC$bin <- sub('...', '', ATAC$bin)
# Split each bin label ("<chr>_<bin number>") into its two parts, derive the
# bin interval from the bin number (bin i starts at i * 250000 and spans
# 250000 positions), sort by position, and put the columns in a fixed order.
ATAC <- ATAC %>%
  separate(bin, c("chr", "bin_number"), sep = "_") %>%
  mutate(
    chr = as.character(chr),
    bin_number = as.numeric(bin_number),
    start_pos = bin_number * 250000,
    end_pos = start_pos + 250000
  ) %>%
  arrange(chr, start_pos) %>%
  select(chr, start_pos, end_pos, bin_number, bin_id, ATAC_val)
ATAC %>% head()

chr,start_pos,end_pos,bin_number,bin_id,ATAC_val
chr1,750000,1000000,3,chr1_3,0.5496224
chr1,1000000,1250000,4,chr1_4,0.5760086
chr1,1250000,1500000,5,chr1_5,0.7151874
chr1,1750000,2000000,7,chr1_7,0.4912511
chr1,2000000,2250000,8,chr1_8,0.456139
chr1,2250000,2500000,9,chr1_9,0.6980095


In [9]:
# Table size before rows with a missing ATAC value are dropped.
ATAC %>% dim()

In [10]:
# Keep only bins that have an ATAC value.
ATAC <- filter(ATAC, !is.na(ATAC_val))

In [11]:
# Table size after rows with a missing ATAC value were dropped.
ATAC %>% dim()

In [12]:
# Empty, typed template for the per-bin methylation summary; it is filled
# with one row per ATAC bin in the next step.
FINAL_meth_df <- tibble(
  chr = character(0),
  start = double(0),
  end = double(0),
  bin_number = double(0),
  methylation = double(0),
  n_meth_values_in_bin = double(0),
  n_na_meth_values_in_bin = double(0)
)
FINAL_meth_df
FINAL_meth_df %>% str()

chr,start,end,bin_number,methylation,n_meth_values_in_bin,n_na_meth_values_in_bin


tibble [0 × 7] (S3: tbl_df/tbl/data.frame)
 $ chr                    : chr(0) 
 $ start                  : num(0) 
 $ end                    : num(0) 
 $ bin_number             : num(0) 
 $ methylation            : num(0) 
 $ n_meth_values_in_bin   : num(0) 
 $ n_na_meth_values_in_bin: num(0) 


In [13]:
# For every ATAC bin, collect the methylation records on the same chromosome
# whose start coordinate lies in [start_pos, end_pos - 1], and record their
# mean methylation together with how many records (and how many NA values)
# were found. The resulting rows are appended to FINAL_meth_df.
n_iter <- nrow(ATAC) # Number of iterations of the loop
if (n_iter == 0) {
  # txtProgressBar() requires max > min, so fail early with a clear message.
  stop("ATAC has no rows; there are no bins to summarise.", call. = FALSE)
}

# Initializes the progress bar
pb <- txtProgressBar(
  min = 0,      # Minimum value of the progress bar
  max = n_iter, # Maximum value of the progress bar
  style = 3,    # Progress bar style (also available style = 1 and style = 2)
  width = 50,   # Progress bar width. Defaults to getOption("width")
  char = "="    # Character used to create the bar
)

# Collect one single-row tibble per bin and bind them once after the loop.
# Growing FINAL_meth_df with add_row() on every iteration copies the whole
# table each time.
bin_rows <- vector("list", n_iter)

for (row_index in seq_len(n_iter)) {
  chrom <- ATAC[["chr"]][[row_index]]
  start_pos <- ATAC[["start_pos"]][[row_index]]
  end_pos <- ATAC[["end_pos"]][[row_index]]
  bin_nr <- ATAC[["bin_number"]][[row_index]]

  # between() is inclusive on both ends, hence end_pos - 1 for a half-open bin.
  meth_values <- met_df %>%
    filter(chr == chrom, between(start, start_pos, end_pos - 1))

  n_meth_values <- nrow(meth_values)
  n_na_meth_values <- sum(is.na(meth_values$methylation))

  if (n_meth_values == 0) {
    print("zero values found")
    print(chrom)
    print(start_pos)
    print(end_pos)
  }

  # mean() is called without na.rm, so a bin containing any NA gets an NA
  # mean, and a bin without records gets NaN.
  bin_rows[[row_index]] <- tibble(
    chr = chrom,
    start = start_pos,
    end = end_pos,
    bin_number = bin_nr,
    methylation = mean(meth_values$methylation),
    n_meth_values_in_bin = n_meth_values,
    n_na_meth_values_in_bin = n_na_meth_values
  )

  setTxtProgressBar(pb, row_index)
}
close(pb)

# Putting the existing table first keeps its column order and column types.
FINAL_meth_df <- bind_rows(c(list(FINAL_meth_df), bin_rows))



In [14]:
# Preview the per-bin methylation summary and report its size.
FINAL_meth_df %>% head()
FINAL_meth_df %>% dim()

chr,start,end,bin_number,methylation,n_meth_values_in_bin,n_na_meth_values_in_bin
chr1,750000,1000000,3,50.85448,12359,0
chr1,1000000,1250000,4,59.94958,15136,0
chr1,1250000,1500000,5,66.37292,13548,0
chr1,1750000,2000000,7,54.63748,12281,0
chr1,2000000,2250000,8,67.25908,13879,0
chr1,2250000,2500000,9,52.50686,13306,0


In [16]:
# Persist the per-bin methylation summary, before any NA filtering.
saveRDS(
  object = FINAL_meth_df,
  file = "../data/methylation/Methylation_files_with_counts/mean_methylation_with_counts.rds"
)

### Removing NAs

In [17]:
# Reload the saved per-bin summary and preview its content and size.
new_meth_df <- readRDS(
  file = "../data/methylation/Methylation_files_with_counts/mean_methylation_with_counts.rds"
)
new_meth_df %>% head()
new_meth_df %>% dim()

chr,start,end,bin_number,methylation,n_meth_values_in_bin,n_na_meth_values_in_bin
chr1,750000,1000000,3,50.85448,12359,0
chr1,1000000,1250000,4,59.94958,15136,0
chr1,1250000,1500000,5,66.37292,13548,0
chr1,1750000,2000000,7,54.63748,12281,0
chr1,2000000,2250000,8,67.25908,13879,0
chr1,2250000,2500000,9,52.50686,13306,0


In [18]:
# Total number of missing cells across all columns of the summary table.
new_meth_df %>%
  is.na() %>%
  sum()

In [19]:
# Drop bins whose mean methylation is missing and report the remaining size.
new_meth_df_na_rm <- filter(new_meth_df, !is.na(methylation))
new_meth_df_na_rm %>% dim()

In [20]:
# Persist the summary table with missing-methylation bins removed.
saveRDS(
  object = new_meth_df_na_rm,
  file = "../data/methylation/Methylation_files_with_counts/mean_methylation_with_counts_NAs_removed.rds"
)

### Converting to bin IDs

In [21]:
# Build the bin identifier "<chr>_<bin_number>" as a new last column, then
# preview the result.
new_meth_df_na_rm <- new_meth_df_na_rm %>%
  mutate(bin_id = paste(chr, bin_number, sep = "_"))
new_meth_df_na_rm %>% head()

chr,start,end,bin_number,methylation,n_meth_values_in_bin,n_na_meth_values_in_bin,bin_id
chr1,750000,1000000,3,50.85448,12359,0,chr1_3
chr1,1000000,1250000,4,59.94958,15136,0,chr1_4
chr1,1250000,1500000,5,66.37292,13548,0,chr1_5
chr1,1750000,2000000,7,54.63748,12281,0,chr1_7
chr1,2000000,2250000,8,67.25908,13879,0,chr1_8
chr1,2250000,2500000,9,52.50686,13306,0,chr1_9


In [22]:
# Report the table size, then put bin_id directly after bin_number by
# selecting the columns in the desired order, and preview the result.
new_meth_df_na_rm %>% dim()
new_meth_df_na_rm <- select(
  new_meth_df_na_rm,
  chr, start, end, bin_number, bin_id,
  methylation, n_meth_values_in_bin, n_na_meth_values_in_bin
)
new_meth_df_na_rm %>% head()

chr,start,end,bin_number,bin_id,methylation,n_meth_values_in_bin,n_na_meth_values_in_bin
chr1,750000,1000000,3,chr1_3,50.85448,12359,0
chr1,1000000,1250000,4,chr1_4,59.94958,15136,0
chr1,1250000,1500000,5,chr1_5,66.37292,13548,0
chr1,1750000,2000000,7,chr1_7,54.63748,12281,0
chr1,2000000,2250000,8,chr1_8,67.25908,13879,0
chr1,2250000,2500000,9,chr1_9,52.50686,13306,0


In [23]:
# Persist the NA-filtered summary table that now carries the bin identifier.
saveRDS(
  object = new_meth_df_na_rm,
  file = "../data/methylation/Methylation_files_with_counts/mean_methylation_with_counts_NAs_removed_with_binID.rds"
)