<a href="https://colab.research.google.com/github/almedida/thesis/blob/main/estimate_m0s_10k_simulation_new_method.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
install.packages('pacman')
library(pacman)

p_load("tidyverse", "tmvtnorm" )

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘zoo’, ‘sandwich’, ‘mvtnorm’, ‘gmm’



tmvtnorm installed



##**Load Dataset**##

In [35]:
# Read the comma-separated p-value table (with a header row) into data_df
data_df <- read.table(
  "10k_ttest_limma_pvalue_sim3.csv",
  header = TRUE,
  sep = ","
)

# Keep a data-frame copy, then pull the first and second columns out as
# one-column matrices
pval_raw <- as.data.frame(as.matrix(data_df))
pvals1 <- as.matrix(pval_raw[, 1])
pvals2 <- as.matrix(pval_raw[, 2])


In [36]:
# Number of p-values held for each column
length(pvals1)
length(pvals2)

# dim(pvals2)
# Leading entries of each p-value vector
head(pvals1, n = 5)
head(pvals2)

0
0.923358
0.265972
0.3822177
0.2254991
0.3411391


0
0.514654518
0.008717797
0.002733117
0.317833927
0.761912116
0.952251137


##**Calculate cut-off points using Histogram-based method**##

calc.cutoff slightly modifies the estimate.m0 function to return the appropriate p-value cutoff where all p-values greater than the cutoff are assumed to come from null cases.

In [37]:
# Histogram-based p-value cutoff.
#
# The range up to `max` is split into B equal-width bins (the first bin also
# captures p-values equal to 0). Scanning from the left, the first bin whose
# count does not exceed the mean count of itself and every bin to its right
# is located, and the upper edge of that bin is returned as the cutoff.
#
# Args:
#   p:   numeric vector (or one-column matrix) of p-values.
#   B:   number of histogram bins.
#   max: upper end of the binned range.
#
# Returns:
#   A single numeric cutoff. When the selected bin is the last one, the
#   cutoff is moved down by one bin width so that it never equals `max`
#   (which would leave no p-values above the cutoff).
calc.cutoff <- function(p, B = 20, max = 1) {
  edges <- c(-0.1, seq_len(B) / B * max)

  # Number of p-values in each half-open bin (edges[i], edges[i + 1]]
  bin.counts <- vapply(
    seq_len(B),
    function(i) sum((p > edges[i]) & (p <= edges[i + 1])),
    numeric(1)
  )

  # tail.means[i] is the mean count over bins i, i + 1, ..., B
  tail.means <- rev(cumsum(rev(bin.counts)) / seq_len(B))
  excess <- bin.counts - tail.means

  # The last bin always has zero excess, so a qualifying index exists
  index <- min(seq_len(B)[excess <= 0])
  cutoff <- index / B * max

  # Test the bin index rather than `cutoff == 1`: the latter is only right
  # for max = 1 and misfires whenever an interior cutoff happens to equal 1.
  if (index == B) {
    cutoff <- max - max / B
  }

  cutoff
}


In [38]:
# Histogram-based cutoff for each p-value vector
cutoff_value1 <- calc.cutoff(pvals1, B = 20, max = 1)
cutoff_value2 <- calc.cutoff(pvals2, B = 20, max = 1)

# One-row matrix holding both cutoffs under named columns
cutoff <- cbind(
  cutoff_value1 = cutoff_value1,
  cutoff_value2 = cutoff_value2
)

cutoff

cutoff_value1,cutoff_value2
0.65,0.75


**Filter dataset using cutoff points**
* Select the p-value pairs that fall in the upper-right quadrant, using the cutoffs found by the histogram-based method.
* The quadrant is defined by lambda1 = cutoff_value1 and lambda2 = cutoff_value2 for p-value 1 and p-value 2, respectively.
* lambda1, lambda2 - the thresholds that p-value 1 and p-value 2, respectively, must reach for a pair to lie in the upper-right quadrant.


In [39]:

# Keep only the rows whose two p-values both reach their cutoffs
p_vals <- filter(
  pval_raw,
  pvalue >= cutoff_value1 & limma_pvalue >= cutoff_value2
)

dim(p_vals)

head(p_vals, n = 5)


Unnamed: 0_level_0,pvalue,limma_pvalue
Unnamed: 0_level_1,<dbl>,<dbl>
1,0.6636096,0.7748455
2,0.8494194,0.9860306
3,0.7048638,0.7792063
4,0.793427,0.7742337
5,0.8501948,0.9666501


convert selected pvalues to z values

In [40]:
# Map every selected p-value to its standard-normal lower-tail quantile
z_val <- p_vals %>%
  as.matrix() %>%
  qnorm(lower.tail = TRUE) %>%
  as.data.frame()
names(z_val) <- c("zvals1", "zvals2")

head(z_val)


Unnamed: 0_level_0,zvals1,zvals2
Unnamed: 0_level_1,<dbl>,<dbl>
1,0.4223347,0.7548998
2,1.0339463,2.1981459
3,0.5384414,0.7695153
4,0.8183699,0.7528624
5,1.0372692,1.8336922
6,2.185225,0.8995627


In [41]:
# Pull each z-value column out into its own one-column data frame
zvals1 <- as.data.frame(as.matrix(z_val[["zvals1"]]))
zvals2 <- as.data.frame(as.matrix(z_val[["zvals2"]]))

head(zvals1, n = 5)

Unnamed: 0_level_0,V1
Unnamed: 0_level_1,<dbl>
1,0.4223347
2,1.0339463
3,0.5384414
4,0.8183699
5,1.0372692


convert lambda(truncation points) to z values

In [42]:
# Column j holds the z-value of cutoff j (row 1) and of p = 1 (row 2)
z_val_extremums <- matrix(c(cutoff_value1, 1, cutoff_value2, 1), nrow = 2) %>%
  qnorm(lower.tail = TRUE) %>%
  as.data.frame()

selecting the lower bounds (minimum z values) of the bivariate z values

In [43]:
# Row 1 holds the z-values of the two cutoffs, i.e. the lower bounds
min_z1 <- z_val_extremums$V1[1]
min_z2 <- z_val_extremums$V2[1]

In [44]:
# Show the lower (row 1) and upper (row 2) z-value bounds
head(z_val_extremums, n = 2)

Unnamed: 0_level_0,V1,V2
Unnamed: 0_level_1,<dbl>,<dbl>
1,0.3853205,0.6744898
2,inf,inf


##**Estimating m00s**##

*  m0.1 - EE (equivalently expressed, i.e. null) genes in experiment 1
*  m0.2 - EE (equivalently expressed, i.e. null) genes in experiment 2
*  m11 - DE (differentially expressed) genes in both experiments
*  m00 - EE genes in both experiments

In [45]:
# Estimate null / non-null gene counts across two experiments from paired
# p-values.
#
# For each experiment a histogram-based cutoff is found with calc.cutoff(),
# and the number of null cases is estimated as the count of p-values at or
# above the cutoff divided by (1 - cutoff). The number of cases that are null
# in both experiments (m00) is estimated from the pairs lying at or above
# both cutoffs: those pairs are converted to z-values, a zero-mean,
# unit-variance bivariate normal truncated below at the z-values of the
# cutoffs is fitted by maximum likelihood over its correlation rho, and the
# pair count is divided by the fitted probability of that quadrant.
#
# Args:
#   p1, p2: numeric vectors (or one-column matrices) of p-values of equal
#           length, paired by position.
#   B:      number of histogram bins passed to calc.cutoff().
#
# Returns:
#   A list with
#     ms:      named numeric vector c(m, m0.1, m0.2, m11, m00), where m is the
#              number of pairs and m11 = m - m0.1 - m0.2 + m00;
#     cutoffs: the two cutoffs c(c1, c2).
#
# Requires dtmvnorm() (tmvtnorm) and pmvnorm() (mvtnorm) to be attached.
estimate.m0s <- function(p1, p2, B = 20) {
  stopifnot(length(p1) == length(p2))
  m <- length(p1)

  ## find lambda cutoffs using histogram-based method
  c1 <- calc.cutoff(p1, B = B, max = 1)
  c2 <- calc.cutoff(p2, B = B, max = 1)

  ## estimate m0 for experiment 1, capped at the total number of tests
  ind1 <- (p1 >= c1)
  m0.1 <- min(sum(ind1) / (1 - c1), m)

  ## estimate m0 for experiment 2, capped at the total number of tests
  ind2 <- (p2 >= c2)
  m0.2 <- min(sum(ind2) / (1 - c2), m)

  ## estimate m00
  ind12 <- ind1 & ind2
  nA <- sum(ind12)

  # Work on the z-scale, using this call's own p-values and cutoffs rather
  # than objects from the calling environment.
  lower_z <- c(qnorm(c1), qnorm(c2))
  z <- cbind(qnorm(p1[ind12]), qnorm(p2[ind12]))
  # p-values equal to 1 map to +Inf and would make the log-likelihood
  # infinite, so they are left out of the fit (they still count in nA).
  z <- z[is.finite(z[, 1]) & is.finite(z[, 2]), , drop = FALSE]

  corr_matrix <- function(rho) {
    matrix(c(1, rho, rho, 1), 2, 2)
  }

  # Negative log-likelihood of the truncated bivariate normal, evaluated on
  # all retained pairs at once (each row of z is one observation).
  neg_log_lik <- function(rho) {
    -sum(dtmvnorm(
      z,
      mean = c(0, 0),
      sigma = corr_matrix(rho),
      lower = lower_z,
      log = TRUE
    ))
  }

  if (nrow(z) > 0) {
    # MLE of rho
    rho <- optimize(neg_log_lik, lower = -1, upper = 1)$minimum
  } else {
    # Nothing to fit: fall back to independence, pA = (1 - c1) * (1 - c2)
    rho <- 0
  }

  # Probability of the upper-right quadrant under the fitted distribution;
  # as.numeric() drops the attributes pmvnorm() attaches to its result.
  pA <- as.numeric(pmvnorm(
    lower = lower_z,
    upper = c(Inf, Inf),
    mean = c(0, 0),
    sigma = corr_matrix(rho)
  ))

  m00 <- min(nA / pA, m)

  ## estimate m11
  m11 <- m - m0.1 - m0.2 + m00
  # A capped estimate is exactly m, so exact comparison is safe here.
  if (m0.1 == m || m0.2 == m || m00 == m) {
    m11 <- 0
  }

  ret <- list()
  ret$ms <- c(m, m0.1, m0.2, m11, m00)
  names(ret$ms) <- c("m", "m0.1", "m0.2", "m11", "m00")
  ret$cutoffs <- c(c1, c2)
  ret
}

In [46]:
# Run the estimator on the two p-value vectors with 20 histogram bins
estimate.m0s(p1 = pvals1, p2 = pvals2, B = 20)