## Calculate correlation

In [2]:
library(tidyverse)

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.1     ✔ purrr   0.3.2
✔ tibble  3.1.4     ✔ dplyr   1.0.7
✔ tidyr   1.1.3     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


### Observed ATAC values

In [3]:
# Load the observed ATAC values, keeping only the bin identifier and its
# value; the value column is renamed to ATAC_observed while selecting.
ATAC_obs <- read.csv("../data/sum_control_normalized_ATAC.csv") %>%
    select(bin, ATAC_observed = ATAC_val)
head(ATAC_obs)

# Standardize the observed values with scale() (default centering and
# scaling). scale() returns a one-column matrix, so as.vector() turns it back
# into a plain numeric column.
ATAC_obs <- ATAC_obs %>%
    mutate(ATAC_observed = as.vector(scale(ATAC_observed)))
head(ATAC_obs)


# Sample annotation: a space-separated file without a header row, holding the
# sample name and its type. Only rows labelled "Healthy" are kept for the
# control subset.
sample_types <- read.table("../data/sample_types.txt", header = FALSE, sep = " ")
colnames(sample_types) <- c("sample", "sample_type")
sample_types_healthy <- sample_types %>% filter(sample_type == "Healthy")
head(sample_types_healthy)

bin,ATAC_observed
chr10_100,0.5931657
chr10_1,0.4089315
chr10_10,0.3058228
chr10_101,0.4764137
chr10_102,0.2862775
chr10_103,0.3169988


bin,ATAC_observed
chr10_100,1.2882623
chr10_1,-0.1842358
chr10_10,-1.0083355
chr10_101,0.3551181
chr10_102,-1.1645524
chr10_103,-0.9190113


sample,sample_type
PGDX16568P,Healthy
PGDX16569P,Healthy
PGDX16570P,Healthy
PGDX16571P,Healthy
PGDX16579P,Healthy
PGDX16580P,Healthy


In [4]:
# Reshape a wide prediction table to long format and attach observed values.
#
# Args:
#   input_pred: data frame with a `sample` column; every other column is
#     pivoted, its name going to `bin` and its value to `ATAC_prediction`.
#   observed: data frame with a `bin` column holding the observed values to
#     join on. Defaults to the global `ATAC_obs`, so existing one-argument
#     calls behave as before.
#
# Returns:
#   A data frame with one row per sample/bin combination whose bin is present
#   in both tables, carrying the columns of `observed` next to
#   `ATAC_prediction`.
format_input <- function(input_pred, observed = ATAC_obs) {
    # Fail early with a clear message instead of a pivot error further down.
    stopifnot(is.data.frame(input_pred), "sample" %in% colnames(input_pred))

    long_pred <- input_pred %>%
        pivot_longer(!sample, values_to = "ATAC_prediction", names_to = "bin")

    # merge() defaults to an inner join: bins absent from `observed` are
    # dropped from the result.
    merge(long_pred, observed, by = "bin")
}

# Restrict a prediction table to the healthy (control) samples.
#
# Args:
#   input_pred: data frame with a `sample` column.
#   healthy_samples: data frame with a `sample` column listing the samples to
#     keep; its other columns are appended to the result. Defaults to the
#     global `sample_types_healthy`, so existing one-argument calls behave as
#     before.
#
# Returns:
#   The rows of `input_pred` whose `sample` appears in `healthy_samples`,
#   joined with the columns of `healthy_samples`.
get_controls <- function(input_pred, healthy_samples = sample_types_healthy) {
    # Fail early with a clear message if the join key is missing.
    stopifnot("sample" %in% colnames(input_pred))

    # merge() defaults to an inner join, which is what filters out every
    # sample that is not listed in `healthy_samples`.
    merge(input_pred, healthy_samples, by = "sample")
}

# Models trained on summed controls

## Gaussian smoothed individuals, lasso std = TRUE

In [27]:
# Read the wide prediction table for this model, reshape it to long format
# joined with the observed values, then subset to the healthy samples.
lasso_pred <- read.csv(
    "../data/ATAC_predictions_on_all_samples_NEW/ATAC_pred_lasso_std_TRUE_gaussian_formatted_standardized.csv"
)
lasso_pred_formatted <- lasso_pred %>% format_input()
lasso_pred_formatted_controls <- lasso_pred_formatted %>% get_controls()

# Preview and size of the full long-format table.
head(lasso_pred_formatted)
dim(lasso_pred_formatted)

# Preview and size of the healthy-only subset.
head(lasso_pred_formatted_controls)
dim(lasso_pred_formatted_controls)

bin,sample,ATAC_prediction,ATAC_observed
chr1_100,PGDX10351P1,1.1665348,1.738932
chr1_100,PGDX17971P1,1.5099425,1.738932
chr1_100,PGDX10570P,0.2511939,1.738932
chr1_100,PGDX18447P,0.1027906,1.738932
chr1_100,PGDX10577P,1.3222183,1.738932
chr1_100,PGDX18453P,0.4123033,1.738932


sample,bin,ATAC_prediction,ATAC_observed,sample_type
PGDX16568P,chr18_6,0.6226771,-1.4943833,Healthy
PGDX16568P,chr11_167,-0.3729613,-1.513523,Healthy
PGDX16568P,chr3_771,-0.3340284,-0.5790788,Healthy
PGDX16568P,chr18_32,0.6909383,-0.998387,Healthy
PGDX16568P,chr6_532,1.1472074,0.4201405,Healthy
PGDX16568P,chr18_238,-0.1833294,0.5271027,Healthy


In [28]:
# Correlation (cor() default method) between observed and predicted values,
# first over every sample, then over the healthy controls only.
cor_all <- cor(
    lasso_pred_formatted$ATAC_observed,
    lasso_pred_formatted$ATAC_prediction
)
cor_control <- cor(
    lasso_pred_formatted_controls$ATAC_observed,
    lasso_pred_formatted_controls$ATAC_prediction
)

paste0("Correlation for all predictions: ", cor_all)
paste0("Correlation for controls: ", cor_control)

## Gaussian smoothed individuals, lasso std = FALSE

In [29]:
# Read the wide prediction table for this model, reshape it to long format
# joined with the observed values, then subset to the healthy samples.
lasso_pred <- read.csv(
    "../data/ATAC_predictions_on_all_samples_NEW/ATAC_pred_lasso_std_FALSE_gaussian_formatted_standardized.csv"
)
lasso_pred_formatted <- lasso_pred %>% format_input()
lasso_pred_formatted_controls <- lasso_pred_formatted %>% get_controls()

In [30]:
# Correlation (cor() default method) between observed and predicted values,
# first over every sample, then over the healthy controls only.
cor_all <- cor(
    lasso_pred_formatted$ATAC_observed,
    lasso_pred_formatted$ATAC_prediction
)
cor_control <- cor(
    lasso_pred_formatted_controls$ATAC_observed,
    lasso_pred_formatted_controls$ATAC_prediction
)

paste0("Correlation for all predictions: ", cor_all)
paste0("Correlation for controls: ", cor_control)

## Not-smoothed (only normalized) individuals, lasso std = TRUE

In [31]:
# Read the wide prediction table for this model, reshape it to long format
# joined with the observed values, then subset to the healthy samples.
lasso_pred <- read.csv(
    "../data/ATAC_predictions_on_all_samples_NEW/ATAC_pred_lasso_std_TRUE_formatted_standardized.csv"
)
lasso_pred_formatted <- lasso_pred %>% format_input()
lasso_pred_formatted_controls <- lasso_pred_formatted %>% get_controls()

In [32]:
# Correlation (cor() default method) between observed and predicted values,
# first over every sample, then over the healthy controls only.
cor_all <- cor(
    lasso_pred_formatted$ATAC_observed,
    lasso_pred_formatted$ATAC_prediction
)
cor_control <- cor(
    lasso_pred_formatted_controls$ATAC_observed,
    lasso_pred_formatted_controls$ATAC_prediction
)

paste0("Correlation for all predictions: ", cor_all)
paste0("Correlation for controls: ", cor_control)

## Gaussian smoothed and trimmed individuals, lasso std = TRUE

In [33]:
# Read the wide prediction table for this model, reshape it to long format
# joined with the observed values, then subset to the healthy samples.
lasso_pred <- read.csv(
    "../data/ATAC_predictions_on_all_samples_NEW/ATAC_pred_lasso_std_TRUE_gaussian_trimmed_formatted_standardized.csv"
)
lasso_pred_formatted <- lasso_pred %>% format_input()
lasso_pred_formatted_controls <- lasso_pred_formatted %>% get_controls()

In [34]:
# Correlation (cor() default method) between observed and predicted values,
# first over every sample, then over the healthy controls only.
cor_all <- cor(
    lasso_pred_formatted$ATAC_observed,
    lasso_pred_formatted$ATAC_prediction
)
cor_control <- cor(
    lasso_pred_formatted_controls$ATAC_observed,
    lasso_pred_formatted_controls$ATAC_prediction
)

paste0("Correlation for all predictions: ", cor_all)
paste0("Correlation for controls: ", cor_control)

# Models trained on individual controls

## Gaussian smoothed individuals, lasso std = TRUE

In [5]:
# Read the wide prediction table for this model, reshape it to long format
# joined with the observed values, then subset to the healthy samples.
lasso_pred <- read.csv(
    "../data/all_samples_train_ATAC_predictions_on_all_samples/all_samples_ATAC_pred_lasso_std_TRUE_gaussian_formatted_standardized.csv"
)
lasso_pred_formatted <- lasso_pred %>% format_input()
lasso_pred_formatted_controls <- lasso_pred_formatted %>% get_controls()

In [6]:
# Correlation (cor() default method) between observed and predicted values,
# first over every sample, then over the healthy controls only.
cor_all <- cor(
    lasso_pred_formatted$ATAC_observed,
    lasso_pred_formatted$ATAC_prediction
)
cor_control <- cor(
    lasso_pred_formatted_controls$ATAC_observed,
    lasso_pred_formatted_controls$ATAC_prediction
)

paste0("Correlation for all predictions: ", cor_all)
paste0("Correlation for controls: ", cor_control)

## Gaussian smoothed individuals, lasso std = FALSE

In [7]:
# Read the wide prediction table for this model, reshape it to long format
# joined with the observed values, then subset to the healthy samples.
lasso_pred <- read.csv(
    "../data/all_samples_train_ATAC_predictions_on_all_samples/all_samples_ATAC_pred_lasso_std_FALSE_gaussian_formatted_standardized.csv"
)
lasso_pred_formatted <- lasso_pred %>% format_input()
lasso_pred_formatted_controls <- lasso_pred_formatted %>% get_controls()

In [8]:
# Correlation (cor() default method) between observed and predicted values,
# first over every sample, then over the healthy controls only.
cor_all <- cor(
    lasso_pred_formatted$ATAC_observed,
    lasso_pred_formatted$ATAC_prediction
)
cor_control <- cor(
    lasso_pred_formatted_controls$ATAC_observed,
    lasso_pred_formatted_controls$ATAC_prediction
)

paste0("Correlation for all predictions: ", cor_all)
paste0("Correlation for controls: ", cor_control)

## Not-smoothed (only normalized) individuals, lasso std = TRUE

In [35]:
# Read the wide prediction table for this model, reshape it to long format
# joined with the observed values, then subset to the healthy samples.
lasso_pred <- read.csv(
    "../data/all_samples_train_ATAC_predictions_on_all_samples/all_samples_ATAC_pred_lasso_std_TRUE_formatted_standardized.csv"
)
lasso_pred_formatted <- lasso_pred %>% format_input()
lasso_pred_formatted_controls <- lasso_pred_formatted %>% get_controls()

In [36]:
# Correlation (cor() default method) between observed and predicted values,
# first over every sample, then over the healthy controls only.
cor_all <- cor(
    lasso_pred_formatted$ATAC_observed,
    lasso_pred_formatted$ATAC_prediction
)
cor_control <- cor(
    lasso_pred_formatted_controls$ATAC_observed,
    lasso_pred_formatted_controls$ATAC_prediction
)

paste0("Correlation for all predictions: ", cor_all)
paste0("Correlation for controls: ", cor_control)

## Gaussian smoothed and trimmed individuals, lasso std = TRUE

In [37]:
# Read the wide prediction table for this model, reshape it to long format
# joined with the observed values, then subset to the healthy samples.
lasso_pred <- read.csv(
    "../data/all_samples_train_ATAC_predictions_on_all_samples/all_samples_ATAC_pred_lasso_std_TRUE_gaussian_trimmed_formatted_standardized.csv"
)
lasso_pred_formatted <- lasso_pred %>% format_input()
lasso_pred_formatted_controls <- lasso_pred_formatted %>% get_controls()

In [38]:
# Correlation (cor() default method) between observed and predicted values,
# first over every sample, then over the healthy controls only.
cor_all <- cor(
    lasso_pred_formatted$ATAC_observed,
    lasso_pred_formatted$ATAC_prediction
)
cor_control <- cor(
    lasso_pred_formatted_controls$ATAC_observed,
    lasso_pred_formatted_controls$ATAC_prediction
)

paste0("Correlation for all predictions: ", cor_all)
paste0("Correlation for controls: ", cor_control)