<a href="https://colab.research.google.com/github/almedida/thesis/blob/main/estimate_m0s_10k_simulation_orr_method.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
install.packages('pacman')
library(pacman)

p_load("tidyverse", "tmvtnorm" )

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



##**Load Dataset**##

In [21]:
# Read the p-value table: comma-separated, first row holds the column names.
data_df <- read.table(
  "ttest_limma_pvalue_10k_sorted.csv",
  header = TRUE,
  sep = ","
)

# Round-trip the table through a matrix, then pull the first two columns out
# as one-column matrices (one per set of p-values).
pval_raw <- as.data.frame(as.matrix(data_df))
pvals1 <- as.matrix(pval_raw[[1]])
pvals2 <- as.matrix(pval_raw[[2]])


In [22]:
# Number of p-values held in each one-column matrix.
length(pvals1)
length(pvals2)

#dim(pvals2)
# Preview the first five p-values of the first column.
head(pvals1, 5)

0
0.031585206
0.000428599
0.057169023
0.310835844
0.018508069


##**Calculate cut-off points using Histogram-based method**##

`calc.cutoff` is a slight modification of the `estimate.m0` function that returns the p-value cutoff; all p-values greater than this cutoff are assumed to come from null cases.

In [23]:
# Histogram-based p-value cutoff.
#
# Splits (0, max] into B equal-width bins, counts the p-values falling in each
# bin, and returns the upper edge of the first bin whose count does not exceed
# the mean count of that bin and all bins to its right. P-values above the
# returned cutoff are assumed to come from null cases.
#
# Args:
#   p:   numeric vector (or one-column matrix) of p-values; NA is not allowed.
#   B:   number of histogram bins; a single whole number >= 1.
#   max: upper edge of the last bin.
#
# Returns:
#   A single numeric cutoff. A cutoff of exactly 1 is pulled back to 1 - 1/B.
#
# Raises:
#   An error if B is not a single whole number >= 1 or if p contains NA.
calc.cutoff <- function(p, B = 20, max = 1) {
  if (!is.numeric(B) || length(B) != 1 || is.na(B) || B < 1 || B != floor(B)) {
    stop("`B` must be a single whole number >= 1", call. = FALSE)
  }
  if (anyNA(p)) {
    stop("`p` must not contain missing values", call. = FALSE)
  }

  # The first edge sits below zero so that p-values equal to 0 land in bin 1.
  edges <- c(-0.1, seq_len(B) / B * max)

  # Count of p-values in each half-open bin (edges[i], edges[i + 1]].
  bin_counts <- vapply(
    seq_len(B),
    function(i) sum(p > edges[i] & p <= edges[i + 1]),
    numeric(1)
  )

  # tail_means[i] is the average count over bins i, i + 1, ..., B.
  tail_means <- rev(cumsum(rev(bin_counts)) / seq_len(B))

  # The last bin always equals its own tail mean, so a match always exists.
  first_flat <- which(bin_counts - tail_means <= 0)[1]

  cutoff <- first_flat / B * max

  # Exact comparison is intended: B / B * 1 evaluates to exactly 1.
  if (cutoff == 1) {
    cutoff <- 1 - 1 / B
  }

  cutoff
}


In [24]:
# Histogram-based cutoff for each set of p-values.
cutoff_value1 <- calc.cutoff(pvals1, B = 20, max = 1)
cutoff_value2 <- calc.cutoff(pvals2, B = 20, max = 1)

# One-row matrix with a named column per cutoff; displayed as the cell output.
cutoff <- cbind(
  cutoff_value1 = cutoff_value1,
  cutoff_value2 = cutoff_value2
)

cutoff

cutoff_value1,cutoff_value2
0.3,0.15


**Filter dataset using cutoff points**
* Select the p-value pairs in the upper-right quadrant, using the cutoffs from the histogram-based method.
* The quadrant is defined by lambda1 = cutoff_value1 and lambda2 = cutoff_value2, the cutoffs for p-value 1 and p-value 2, respectively.
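* A p-value that is uniform on [0, 1] is at or above its cutoff with probability 1 - lambda1 (p-value 1) or 1 - lambda2 (p-value 2), so an independent pair falls in the upper-right quadrant with probability (1 - lambda1)(1 - lambda2).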


In [25]:
# Keep only the rows whose two p-values both reach their cutoffs.
p_vals <- pval_raw %>%
  dplyr::filter(
    pvalue >= cutoff_value1,
    limma_pval >= cutoff_value2
  )

dim(p_vals)

head(p_vals, n = 5)


Unnamed: 0_level_0,pvalue,limma_pval
Unnamed: 0_level_1,<dbl>,<dbl>
1,0.3108358,0.416021
2,0.5207212,0.5490873
3,0.8153381,0.8197118
4,0.6809475,0.6550252
5,0.6465759,0.6798327


##**Estimating m00s**##

*  m0.1 - EE (null) genes in experiment 1
*  m0.2 - EE (null) genes in experiment 2
*  m11 - DE genes in both experiments
*  m00 - EE genes in both experiments

In [38]:
# Estimate null and non-null counts across two sets of p-values.
#
# For each set, a histogram-based cutoff is obtained from calc.cutoff() and the
# number of nulls is estimated as (# p-values >= cutoff) / (1 - cutoff). The
# number of cases that are null in both sets (m00) is estimated the same way
# from the pairs where both p-values reach their cutoffs, scaled by
# (1 - c1) * (1 - c2). m11 follows by inclusion-exclusion.
#
# Args:
#   p1: p-values from the first experiment (vector or one-column matrix).
#   p2: p-values from the second experiment; same length and order as p1.
#   B:  number of histogram bins passed on to calc.cutoff().
#
# Returns:
#   A list with
#     ms:      named numeric vector c(m, m0.1, m0.2, m11, m00)
#     cutoffs: numeric vector c(c1, c2) of the cutoffs used
#
# Raises:
#   An error if p1 and p2 differ in length.
estimate.m0s <- function(p1, p2, B = 20) {
  if (length(p1) != length(p2)) {
    stop("`p1` and `p2` must have the same length", call. = FALSE)
  }
  m <- length(p1)

  ## find lambda cutoffs using histogram-based method
  c1 <- calc.cutoff(p1, B = B, max = 1)
  c2 <- calc.cutoff(p2, B = B, max = 1)

  ## indicators of p-values at or above each cutoff
  ind1 <- p1 >= c1
  ind2 <- p2 >= c2

  ## raw (uncapped) estimates for experiment 1, experiment 2 and both
  raw_m0.1 <- sum(ind1) / (1 - c1)
  raw_m0.2 <- sum(ind2) / (1 - c2)
  raw_m00 <- sum(ind1 & ind2) / ((1 - c1) * (1 - c2))

  ## A null count cannot exceed the number of tests, so cap each estimate at m.
  ## (The cap used to be the literal 10000, which is only right when m is 10000.)
  m0.1 <- min(raw_m0.1, m)
  m0.2 <- min(raw_m0.2, m)
  m00 <- min(raw_m00, m)

  ## estimate m11 by inclusion-exclusion; it is set to 0 whenever any of the
  ## estimates reached the cap. The previous code wrote `m11 == 0` here, a
  ## comparison whose result was discarded, so the reset never took effect.
  ## NOTE(review): away from the cap m11 is not clamped and may come out
  ## negative - confirm whether a floor at 0 is wanted.
  hit_cap <- raw_m0.1 >= m || raw_m0.2 >= m || raw_m00 >= m
  if (hit_cap) {
    m11 <- 0
  } else {
    m11 <- m - m0.1 - m0.2 + m00
  }

  ret <- list()
  ret$ms <- c(m, m0.1, m0.2, m11, m00)
  names(ret$ms) <- c("m", "m0.1", "m0.2", "m11", "m00")
  ret$cutoffs <- c(c1, c2)
  ret
}

In [39]:
# Run the estimator on the two p-value columns using 20 histogram bins.
estimate.m0s(pvals1, pvals2, B=20)