In [3]:
library(tidyverse)
library(tidyr)
library(scales)
library(LICORS)
library(slider)
library(glmnet)
library(gbm)

Loading required package: Matrix

Attaching package: ‘Matrix’

The following objects are masked from ‘package:tidyr’:

    expand, pack, unpack

Loading required package: foreach

Attaching package: ‘foreach’

The following objects are masked from ‘package:purrr’:

    accumulate, when

Loaded glmnet 2.0-16

Loaded gbm 2.1.8


In [None]:
# Training table, read from the RDS file Snakemake passes as "input_train".
sum_control_ATAC_bin_rm <- readRDS(snakemake@input[["input_train"]])

In [None]:
# Seed the RNG before fitting.
set.seed(0)

# Boosted model of ATAC_val on all remaining columns of the training table,
# grown to 600 trees with 10-fold cross-validation.
fit <- gbm(
  ATAC_val ~ .,
  data = sum_control_ATAC_bin_rm,
  n.trees = 600,
  cv.folds = 10
)

In [None]:
# Long-format table of the square-rooted CV and training error per tree:
# one row per (tree, key) pair, with key being "rmse_cv" or "rmse_train".
pd <- tibble(
  rmse_cv = sqrt(fit$cv.error),
  rmse_train = sqrt(fit$train.error)
) %>%
  mutate(tree = seq_len(n())) %>%
  pivot_longer(cols = -tree, names_to = "key", values_to = "value")

# Both error curves against the tree index, one colour per curve.
ggplot(pd, aes(x = tree, y = value, color = key)) +
  geom_line() +
  geom_point()

In [None]:
# Estimate the optimal number of trees from the cross-validation error.
gbm.perf(object = fit, method = "cv")

In [None]:
# Training error at the iteration where its square root is smallest.
with(fit, train.error[which.min(sqrt(train.error))])

# Cross-validation error at the iteration where its square root is smallest.
with(fit, cv.error[which.min(sqrt(cv.error))])


In [None]:
# Observed response of the training table, wrapped in a one-column data frame.
observed <- as.data.frame(sum_control_ATAC_bin_rm$ATAC_val)
# Cross-validated fitted values stored on the gbm object.
predictions <- fit[["cv.fitted"]]

In [None]:
# Put predictions next to the observed values and give both columns
# short, fixed names used by the plot and the correlation below.
combined <- setNames(
  cbind(predictions, observed),
  c("predictions", "observed")
)
head(combined)

In [None]:
# Scatter of predicted against observed values with a linear trend line.
p1 <- ggplot(combined, aes(observed, predictions)) +
  geom_point(size = 0.5) +
  geom_smooth(method = "lm", formula = y ~ x)

In [None]:
# Write the summed-controls scatter plot to the Snakemake output path.
# `filename` is spelled out in full; the previous `file =` only worked through
# partial argument matching.
ggsave(filename = snakemake@output[["boosting_plot_summed"]], plot = p1)

In [None]:
# Correlation between observed and predicted values for the summed controls.
summed_corr <- with(combined, cor(observed, predictions))

### Prediction for all individuals

In [None]:
# Per-individual table, read from the RDS file Snakemake passes as "input_test".
all_individuals <- readRDS(snakemake@input[["input_test"]])

In [None]:
# Two-column ATAC table (bin, ATAC_val); the file is read without a header
# and the column names are assigned here.
ATAC <- setNames(
  read.table(snakemake@input[["ATAC_input"]]),
  c("bin", "ATAC_val")
)

# Attach the ATAC value to every per-individual row that has a matching bin.
all_individuals_ATAC <- all_individuals %>%
  inner_join(ATAC, by = "bin")

In [None]:
# Predictor columns only: drop the sample and bin identifier columns.
testdata <- select(all_individuals, -sample, -bin)

In [None]:
# Predict ATAC for every per-individual row and pair each prediction with the
# ATAC value observed for that same row.
#
# The observed values previously came from the training table
# (sum_control_ATAC_bin_rm), which is not row-aligned with the per-individual
# predictions. Both predictors and observations are now taken from
# all_individuals_ATAC so that they stay aligned after the inner join.
eval_predictors <- all_individuals_ATAC %>%
  dplyr::select(-sample, -bin, -ATAC_val)

y <- all_individuals_ATAC$ATAC_val
tmp <- predict(fit, eval_predictors)

# Build a data frame (not a matrix) so that `$` access and ggplot work on it.
data <- data.frame(predicted = tmp, observed = y)

In [None]:
# `data` may be a matrix (cbind of a vector and a matrix yields one), on which
# `$` is an error; coerce to a data frame before column access and plotting.
eval_df <- as.data.frame(data)

# Correlation between predicted and observed values for the individuals.
individual_corr <- cor(eval_df$predicted, eval_df$observed)

# Scatter of predicted against observed values with a linear trend line.
p2 <- ggplot(eval_df, aes(x = observed, y = predicted)) +
  geom_point(size = 0.5) +
  geom_smooth(method = "lm", formula = y ~ x)

# `filename` is spelled out in full rather than relying on partial matching.
ggsave(filename = snakemake@output[["boosting_plot_individual"]], plot = p2)

In [None]:
# Stack the two correlation values into a labelled two-row table and write it
# to the Snakemake output path as CSV.
correlations <- rbind(summed_corr, individual_corr)
rownames(correlations) <- c("summed controls", "control individually")
# NOTE(review): the output key is "lasso_ridge_corr" although this script fits
# a gbm model; confirm the key against the Snakefile before renaming.
# The call was missing its closing parenthesis.
write.csv(correlations, file = snakemake@output[["lasso_ridge_corr"]])