In [2]:
require(data.table)
require(tidyverse)

## 1. Read data 

In [3]:
# Load the long-format distal peak table (columns: distal_peak, barcode,
# cluster) and take a first look at its size and leading rows.
dat.mat <- data.table::fread("../dat/1908/all.distal.long.matrix.csv")
dim(dat.mat)
head(dat.mat)
# dat.mat$cell <- NULL  # no need for the cell id

distal_peak,barcode,cluster
1_10216_10512,Islet3-fresh_CTGAAGCTTGCAGCTACTCTCTATTATAGCCT,beta_2
1_237657_237858,Islet3-fresh_CTGAAGCTTGCAGCTACTCTCTATTATAGCCT,beta_2
1_967851_968930,Islet3-fresh_CTGAAGCTTGCAGCTACTCTCTATTATAGCCT,beta_2
1_967851_968930,Islet3-fresh_CTGAAGCTTGCAGCTACTCTCTATTATAGCCT,beta_2
1_1004053_1005548,Islet3-fresh_CTGAAGCTTGCAGCTACTCTCTATTATAGCCT,beta_2
1_1004053_1005548,Islet3-fresh_CTGAAGCTTGCAGCTACTCTCTATTATAGCCT,beta_2


 Concepts: 
1. `promoter region`: the window from -500 bp to +500 bp around every TSS in GENCODE
2. `promoter Peaks`: peaks that overlap promoter region

In [4]:
# Keep each distinct row once; repeated rows for the same peak and barcode
# would otherwise be counted several times.
dat.mat <- distinct(dat.mat)
dim(dat.mat)


In [6]:
# all cells
# Drop the peak column, keep each remaining (barcode, cluster) row once, and
# count rows per cluster, i.e. the number of cells in every cluster.
# The column is spelled out in full (`$cluster`): the previous `$clust` only
# worked through partial name matching, which tibbles do not support.
dat.all.subcells <- table((dat.mat[, -"distal_peak"] %>% distinct())$cluster)
dat.all.subcells


      alpha_1       alpha_2        beta_1        beta_2       delta_1 
         4266          1328          4354          2816           683 
      delta_2 endothelial_1 endothelial_2      exocrine         gamma 
           35            62            95           131           206 
        glial        immune      stellate 
           39            71           153 

In [7]:
# Parent cell type of every cluster: strip a trailing "_1" / "_2" subtype
# suffix. The pattern is anchored to the end of the name so that a "_1" or
# "_2" occurring elsewhere in a cluster name is left untouched.
sub("_[12]$", "", names(dat.all.subcells))

In [8]:
# Total number of cells: sum of the per-cluster counts in dat.all.subcells.
sum(dat.all.subcells)

In [9]:
# how many peaks: number of distinct distal peak ids in the table
uniqueN(dat.mat$distal_peak)

## 2 Prepare data


Peak|n_cells|celltype 
--- | --- | ---


In [10]:
# prepare data
# Replace dat.mat by its per-(peak, cluster) row counts `N`, ordered by peak
# and cluster. NOTE: this overwrites dat.mat, so re-running this step on the
# already aggregated table would count aggregated rows instead.
dat.mat <- dat.mat[, .(N = .N), by = list(distal_peak, cluster)] %>%
    arrange(distal_peak, cluster)

In [11]:
head(dat.mat)

distal_peak,cluster,N
1_100009936_100010354,alpha_1,17
1_100009936_100010354,alpha_2,6
1_100009936_100010354,beta_1,125
1_100009936_100010354,beta_2,97
1_100009936_100010354,delta_1,2
1_100009936_100010354,endothelial_1,4


## 3 Fisher's test one vs other 

In [41]:
# Worked example of the one-vs-other test on a single peak: is the peak
# present in a different fraction of `celltypes` cells than of all other cells?
tr <- "1_100009936_100010354"
# celltypes <- "delta_2"  # alternative subtype to try
celltypes <- "alpha_1"
test.dat <- dat.mat %>% filter(distal_peak == tr)
# Relabel every cluster except the tested one as "other" and add up the counts.
test.dat <- test.dat %>%
    mutate(cluster = ifelse(cluster == celltypes, celltypes, "other")) %>%
    group_by(distal_peak, cluster) %>%
    summarise(N = sum(N))
test.dat
table.res <- test.dat$N
names(table.res) <- test.dat$cluster
# Zero-fill whichever side has no cell carrying this peak. "other" has to be
# covered as well: a peak seen only in `celltypes` would otherwise leave an NA
# in the table and make fisher.test() fail.
a <- setdiff(c(celltypes, "other"), names(table.res))
table.res[a] <- 0
table.res
all_test_cells <- as.numeric(dat.all.subcells[celltypes])
other_cells <- as.numeric(sum(dat.all.subcells) - dat.all.subcells[celltypes])
# 2 x 2 table: rows = peak present (Yes / No), columns = tested subtype / other.
test.tab <- matrix(
    c(table.res[celltypes], table.res["other"],
      all_test_cells - table.res[celltypes], other_cells - table.res["other"]),
    byrow = TRUE, nrow = 2,
    dimnames = list(expressed = c("Yes", "No"), subtype = c(celltypes, "other"))
)
test.tab
test.tab[1]
test.tab[2]

# Linear indexing is column-major: [1], [2] are Yes / No for the tested
# subtype and [3], [4] are Yes / No for "other".
(f1 <- test.tab[1]/(test.tab[1] + test.tab[2]))
(f2 <- test.tab[3]/(test.tab[3] + test.tab[4]))
# One-sided alternative chosen from the observed direction of the difference.
(l <- if (f1 > f2) "greater" else "less")
test.res <- fisher.test(test.tab, alternative = l)
test.res
res <- list(pval = test.res$p.value, odds = test.res$estimate, type1_frac = f1, type2_frac = f2)
res

# Two-sided test on the same table, for comparison.
(test.res <- fisher.test(test.tab))

distal_peak,cluster,N
1_100009936_100010354,alpha_1,17
1_100009936_100010354,other,241


Unnamed: 0,alpha_1,other
Yes,17,241
No,4249,9732



	Fisher's Exact Test for Count Data

data:  test.tab
p-value < 2.2e-16
alternative hypothesis: true odds ratio is less than 1
95 percent confidence interval:
 0.0000000 0.2469562
sample estimates:
odds ratio 
  0.161571 



	Fisher's Exact Test for Count Data

data:  test.tab
p-value < 2.2e-16
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.09249277 0.26460818
sample estimates:
odds ratio 
  0.161571 


function

In [40]:
#' Fisher's exact test for one peak: one subtype versus all other cells.
#'
#' Builds the 2 x 2 table (peak present Yes / No x `celltypes` / "other") and
#' runs a one-sided `fisher.test()` whose direction follows the observed
#' fractions: "greater" when the subtype fraction is higher, "less" otherwise.
#'
#' @param tr Peak id to test (a value of `dat$distal_peak`).
#' @param celltypes A single cluster name, compared against all other clusters.
#' @param dat Table with columns `distal_peak`, `cluster` and `N` (the count
#'   for that peak in that cluster).
#' @param n_cells Named per-cluster total cell counts. Defaults to the global
#'   `dat.all.subcells`, which is what this function always read implicitly.
#' @return A list with `distal_peak`, `pval`, `odds` (the `fisher.test()`
#'   estimate), `type1_frac` (fraction in `celltypes`) and `type2_frac`
#'   (fraction in all other cells).
fun.testPerTr_OneVsOther <- function(tr = "1_100009936_100010354", celltypes = "alpha_1",
    dat = dat.mat, n_cells = dat.all.subcells) {
    stopifnot(length(celltypes) == 1, celltypes %in% names(n_cells))

    # Counts for this peak, split into the tested subtype and everything else.
    # sum() over an empty selection is 0, so a peak that is missing from either
    # side (including "other") yields a 0 instead of an NA.
    in_peak <- which(dat$distal_peak == tr)
    is_target <- dat$cluster[in_peak] %in% celltypes
    hits <- dat$N[in_peak]
    hit_target <- sum(hits[is_target])
    hit_other <- sum(hits[!is_target])

    all_test_cells <- as.numeric(n_cells[celltypes])
    other_cells <- as.numeric(sum(n_cells)) - all_test_cells

    # Rows: peak present (Yes / No); columns: tested subtype / other.
    test.tab <- matrix(
        c(hit_target, hit_other,
          all_test_cells - hit_target, other_cells - hit_other),
        byrow = TRUE, nrow = 2,
        dimnames = list(expressed = c("Yes", "No"), subtype = c(celltypes, "other"))
    )

    # Linear indexing is column-major: [1], [2] = Yes / No for the subtype,
    # [3], [4] = Yes / No for "other".
    f1 <- test.tab[1]/(test.tab[1] + test.tab[2])
    f2 <- test.tab[3]/(test.tab[3] + test.tab[4])
    l <- if (f1 > f2) "greater" else "less"
    test.res <- fisher.test(test.tab, alternative = l)
    list(distal_peak = tr, pval = test.res$p.value, odds = test.res$estimate,
        type1_frac = f1, type2_frac = f2)
}

## 

# Time a single call with the default peak and subtype, then show the result
# as a one-row table.
celltypes <- "alpha_1"

system.time({
    res <- fun.testPerTr_OneVsOther()
})
t(res)

   user  system elapsed 
  0.050   0.002   0.054 

distal_peak,pval,odds,type1_frac,type2_frac
1_100009936_100010354,7.098895e-21,0.161571,0.003984998,0.02416525


In [None]:
# NOTE(review): this cell has no execution count (it was never run in the
# saved notebook). It calls fun.ftestPerTr, which is only defined further down
# in this file, and it writes to the same .Rds path as the later batch cell, so
# it looks like a copy of that cell rather than a one-vs-other run -- confirm
# the intent before executing it.
require(parallel)
# Subtype pairs to compare, keyed by parent cell type.
celltypes <- list()
celltypes$alpha <- c("alpha_1", "alpha_2")
celltypes$beta <- c("beta_1", "beta_2")
celltypes$delta <- c("delta_1", "delta_2")
res.transcript_level <- list()
# time-consuming task
system.time(for (x in c("alpha", "beta", "delta")) {
    
    # Restrict to the two subtypes of this cell type and list their peaks.
    dat.mat.sub <- dat.mat %>% filter(cluster %in% celltypes[[x]])
    all.tr <- unique(dat.mat.sub$distal_peak)
    
    # One test per peak on 10 cores; each result becomes one row of the stacked table.
    res.transcript_level[[x]] <- do.call(rbind, mclapply(all.tr, function(trr) t(fun.ftestPerTr(dat = dat.mat.sub, 
        tr = trr, celltypes = celltypes[[x]])), mc.cores = 10))
})

# Persist the per-cell-type result list.
saveRDS(object = res.transcript_level, file = "../dat/1901/res.distal.peaks.fisher.Rds")

## 4. Fisher's exact test  between subtypes

A peak is counted as open in a cell as long as that cell has at least one entry for the peak.

1. get total alpha 1 and alpha 2 cells 
2. test hits in alpha1 vs hits in alpha2 (create a contingency table)
3. perform [Fisher's exact test](https://en.wikipedia.org/wiki/Fisher%27s_exact_test) or [chi-squared test](https://en.wikipedia.org/wiki/Chi-squared_test)

Input: `dat.mat` and `dat.all.cells`
output: 

peak|n_celltype1|n_celltype2|total_cells|celltype | pval | frac_1 | frac_2| odds


### 4.1 perform fisher's exact test for distal peaks

In [33]:
# Worked example of the subtype-vs-subtype test on a single peak.
# NOTE(review): `dat.all.cells` is not defined anywhere in the visible
# notebook, and the recorded output of this cell is the resulting
# "object 'dat.all.cells' not found" error. It is presumably a named vector of
# per-cluster total cell counts -- confirm where it comes from.
tr = "1_100009936_100010354"
# The first assignment is immediately overwritten; only the delta pair is used.
celltypes <- c("alpha_1", "alpha_2")
celltypes <- c("delta_1", "delta_2")

# Counts of this peak in the two subtypes being compared.
test.dat <- dat.mat %>% filter(distal_peak == tr & cluster %in% celltypes)
test.dat
table.res<- test.dat$N; names(table.res) <- test.dat$cluster
# handle if 0 for one subtype
a = setdiff(celltypes, names(table.res))
table.res[a] <- 0
# 2 x 2 table: rows = peak present (Yes / No), columns = the two subtypes.
test.tab <- matrix(c(table.res[celltypes[1]], table.res[celltypes[2]], dat.all.cells[celltypes[1]] - 
    table.res[celltypes[1]], dat.all.cells[celltypes[2]] - table.res[celltypes[2]]), 
    byrow = T, nrow = 2, dimnames = list(expressed = c("Yes", "No"), subtype = celltypes))
test.tab
test.tab[1]
test.tab[2]


# Linear indexing is column-major: [1], [2] = Yes / No for the first subtype,
# [3], [4] = Yes / No for the second, so f1 / f2 are the per-subtype fractions.
f1 <- test.tab[1]/(test.tab[1] + test.tab[2])
f2 <- test.tab[3]/(test.tab[3] + test.tab[4])
# One-sided alternative chosen from the observed direction of the difference.
l <- ifelse(f1 > f2, "greater", "less")
test.res <- fisher.test(test.tab, alternative = l)
res <- list(pval = test.res$p.value, odds = test.res$estimate, type1_frac = f1, type2_frac = f2)
res

# Two-sided test on the same table, for comparison.
fisher.test(test.tab)

distal_peak,cluster,N
1_100009936_100010354,alpha_1,17


ERROR: Error in matrix(c(table.res[celltypes[1]], table.res[celltypes[2]], dat.all.cells[celltypes[1]] - : object 'dat.all.cells' not found


In [91]:

#' Fisher's exact test for one peak between two subtypes.
#'
#' Builds the 2 x 2 table (peak present Yes / No x the two subtypes) and runs
#' a one-sided `fisher.test()` whose direction follows the observed fractions:
#' "greater" when the first subtype's fraction is higher, "less" otherwise.
#'
#' @param tr Peak id to test (a value of `dat$distal_peak`).
#' @param celltypes Character vector with the two cluster names to compare.
#' @param dat Table with columns `distal_peak`, `cluster` and `N` (the count
#'   for that peak in that cluster). Rows of other clusters are ignored.
#' @param n_cells Named per-cluster total cell counts. Defaults to the global
#'   `dat.all.cells`, which this function previously read implicitly; that
#'   object is not created anywhere in the visible notebook, so either define
#'   it in the session or pass the counts explicitly.
#' @return A list with `distal_peak`, `pval`, `odds` (the `fisher.test()`
#'   estimate), `type1_frac` and `type2_frac` (fraction of cells carrying the
#'   peak in the first and second subtype).
fun.ftestPerTr <- function(  tr='1_100009936_100010354',#=1
                             celltypes=c('alpha_1','alpha_2'),
                             dat=dat.mat,
                             n_cells=dat.all.cells){
    stopifnot(length(celltypes) == 2, all(celltypes %in% names(n_cells)))

    # Counts for this peak in each of the two subtypes. sum() over an empty
    # selection is 0, which covers a subtype with no cell carrying the peak.
    in_peak <- which(dat$distal_peak == tr)
    clusters <- dat$cluster[in_peak]
    hits <- dat$N[in_peak]
    hit_1 <- sum(hits[clusters %in% celltypes[1]])
    hit_2 <- sum(hits[clusters %in% celltypes[2]])

    total_1 <- as.numeric(n_cells[celltypes[1]])
    total_2 <- as.numeric(n_cells[celltypes[2]])

    # Rows: peak present (Yes / No); columns: first / second subtype.
    test.tab <- matrix(c(hit_1, hit_2,
                         total_1 - hit_1, total_2 - hit_2),
                       byrow = TRUE,
                       nrow = 2,
                       dimnames = list(expressed = c("Yes", "No"),
                                       subtype = celltypes))

    # Linear indexing is column-major: [1], [2] = Yes / No for the first
    # subtype, [3], [4] = Yes / No for the second.
    f1 <- test.tab[1]/(test.tab[1]+test.tab[2])
    f2 <- test.tab[3]/(test.tab[3]+test.tab[4])
    l <- if (f1 > f2) 'greater' else 'less'
    test.res <- fisher.test(test.tab, alternative = l)
    list(
        distal_peak = tr,
        pval = test.res$p.value,
        odds = test.res$estimate,
        type1_frac = f1,
        type2_frac = f2
    )
}

##

# Time one call for the alpha pair, then show the result as a one-row table.
celltypes <- c("alpha_1", "alpha_2")

system.time(
    fun.ftestPerTr(celltypes = celltypes)
)
t(fun.ftestPerTr(celltypes = celltypes))


   user  system elapsed 
  0.057   0.000   0.058 

distal_peak,pval,odds,type1_frac,type2_frac
1_100009936_100010354,0.4746936,0.881426,0.003978469,0.004511278


In [92]:

# Restrict the count table once to the two subtypes under comparison so the
# per-peak test does not have to filter by cluster on every call.
celltypes <- c("alpha_1", "alpha_2")
dat.mat.sub <- filter(dat.mat, cluster %in% celltypes)
#' Fisher's exact test for one peak between two subtypes.
#'
#' Builds the 2 x 2 table (peak present Yes / No x the two subtypes) and runs
#' a one-sided `fisher.test()` whose direction follows the observed fractions:
#' "greater" when the first subtype's fraction is higher, "less" otherwise.
#'
#' @param tr Peak id to test (a value of `dat$distal_peak`).
#' @param celltypes Character vector with the two cluster names to compare.
#' @param dat Table with columns `distal_peak`, `cluster` and `N` (the count
#'   for that peak in that cluster); by default the pre-filtered
#'   `dat.mat.sub`. Rows of other clusters are ignored.
#' @param n_cells Named per-cluster total cell counts. Defaults to the global
#'   `dat.all.cells`, which this function previously read implicitly; that
#'   object is not created anywhere in the visible notebook, so either define
#'   it in the session or pass the counts explicitly.
#' @return A list with `distal_peak`, `pval`, `odds` (the `fisher.test()`
#'   estimate), `type1_frac` and `type2_frac` (fraction of cells carrying the
#'   peak in the first and second subtype).
fun.ftestPerTr <- function(  tr='1_100009936_100010354',#=1
                             celltypes=c('alpha_1','alpha_2'),
                             dat=dat.mat.sub,
                             n_cells=dat.all.cells){
    stopifnot(length(celltypes) == 2, all(celltypes %in% names(n_cells)))

    # Counts for this peak in each of the two subtypes. sum() over an empty
    # selection is 0, which covers a subtype with no cell carrying the peak.
    in_peak <- which(dat$distal_peak == tr)
    clusters <- dat$cluster[in_peak]
    hits <- dat$N[in_peak]
    hit_1 <- sum(hits[clusters %in% celltypes[1]])
    hit_2 <- sum(hits[clusters %in% celltypes[2]])

    total_1 <- as.numeric(n_cells[celltypes[1]])
    total_2 <- as.numeric(n_cells[celltypes[2]])

    # Rows: peak present (Yes / No); columns: first / second subtype.
    test.tab <- matrix(c(hit_1, hit_2,
                         total_1 - hit_1, total_2 - hit_2),
                       byrow = TRUE,
                       nrow = 2,
                       dimnames = list(expressed = c("Yes", "No"),
                                       subtype = celltypes))

    # Linear indexing is column-major: [1], [2] = Yes / No for the first
    # subtype, [3], [4] = Yes / No for the second.
    f1 <- test.tab[1]/(test.tab[1]+test.tab[2])
    f2 <- test.tab[3]/(test.tab[3]+test.tab[4])
    l <- if (f1 > f2) 'greater' else 'less'
    test.res <- fisher.test(test.tab, alternative = l)
    list(
        distal_peak = tr,
        pval = test.res$p.value,
        odds = test.res$estimate,
        type1_frac = f1,
        type2_frac = f2
    )
}

##



# Time one call of the redefined function (default dat = dat.mat.sub), then
# show the result as a one-row table.
system.time(
    fun.ftestPerTr(celltypes = celltypes)
)
t(fun.ftestPerTr(celltypes = celltypes))


   user  system elapsed 
  0.028   0.001   0.029 

distal_peak,pval,odds,type1_frac,type2_frac
1_100009936_100010354,0.4746936,0.881426,0.003978469,0.004511278


In [93]:

# estimate time
# Number of peaks to test over the three cell types, times the 0.029 s
# measured for one call, converted from seconds to hours.
celltypes <- list(
    alpha = c("alpha_1", "alpha_2"),
    beta = c("beta_1", "beta_2"),
    delta = c("delta_1", "delta_2")
)
sum(vapply(celltypes[c("alpha", "beta", "delta")], function(pair) {
    length(unique(dat.mat$distal_peak[dat.mat$cluster %in% pair]))
}, integer(1))) * 0.029 / 3600

In [94]:
## test run
# Serial dry run on the first 10 beta peaks; each result is one row.
x <- "beta"
dat.mat.sub <- filter(dat.mat, cluster %in% celltypes[[x]])
all.tr <- unique(dat.mat.sub$distal_peak)
length(all.tr)
do.call(rbind, lapply(all.tr[1:10], function(peak) {
    t(fun.ftestPerTr(dat = dat.mat.sub, tr = peak, celltypes = celltypes[[x]]))
}))

distal_peak,pval,odds,type1_frac,type2_frac
1_100009936_100010354,0.09751913,0.8286919,0.02868288,0.03440937
1_100014517_100015228,0.005376729,1.569892,0.02592933,0.01667258
1_100017588_100018128,1.088472e-07,0.09872459,0.0009178522,0.009223129
1_100023318_100023949,0.000406392,0.2145829,0.001376778,0.006385243
1_100056203_100056908,0.001257952,0.5105029,0.008949059,0.01738205
1_100064607_100064883,0.6072175,inf,0.0002294631,0.0
1_100065165_100065531,0.6313201,0.6468356,0.0002294631,0.0003547357
1_100080955_100081248,0.2238285,inf,0.0006883892,0.0
1_10010384_10010790,0.001089532,2.039122,0.01720973,0.008513657
1_100113722_100114033,0.005456032,1.486748,0.03235429,0.02199361


In [95]:
require(parallel)
# Subtype pairs to compare, keyed by parent cell type.
celltypes <- list(
    alpha = c("alpha_1", "alpha_2"),
    beta = c("beta_1", "beta_2"),
    delta = c("delta_1", "delta_2")
)
res.transcript_level <- list()
# Time-consuming task: one Fisher test per peak and cell type, on 10 cores.
system.time(for (x in names(celltypes)) {

    # Restrict to the two subtypes of this cell type and list their peaks.
    dat.mat.sub <- filter(dat.mat, cluster %in% celltypes[[x]])
    all.tr <- unique(dat.mat.sub$distal_peak)

    # Each per-peak result becomes one row of the stacked table.
    per.peak <- function(peak) {
        t(fun.ftestPerTr(dat = dat.mat.sub, tr = peak, celltypes = celltypes[[x]]))
    }
    res.transcript_level[[x]] <- do.call(rbind, mclapply(all.tr, per.peak, mc.cores = 10))
})

# Persist the per-cell-type result list.
saveRDS(object = res.transcript_level, file = "../dat/1901/res.distal.peaks.fisher.Rds")

    user   system  elapsed 
8810.949  620.087 3677.038 

In [100]:
# Stack the per-cell-type result tables into one data frame, tagging every
# row with the cell type it came from.
res.transcript_level <- do.call(rbind, lapply(c("alpha", "beta", "delta"), function(ct) {
    data.frame(res.transcript_level[[ct]], celltype = ct)
}))


In [104]:
head(res.transcript_level)
# Number of tests times 0.028 s per call, converted from seconds to hours.
nrow(res.transcript_level) / 3600 * 0.028

distal_peak,pval,odds,type1_frac,type2_frac,celltype
1_100009936_100010354,0.4746936,0.881426,0.003978469,0.004511278,alpha
1_100014517_100015228,0.5323403,1.089823,0.00491458,0.004511278,alpha
1_100017588_100018128,0.001922056,0.1549027,0.0009361105,0.006015038,alpha
1_100023318_100023949,0.0004998282,0.1547046,0.001170138,0.007518797,alpha
1_100056203_100056908,0.476826,1.868512,0.001404166,0.0007518797,alpha
1_100064607_100064883,0.7626272,inf,0.0002340276,0.0,alpha


In [105]:
# Export the stacked results as delimited text.
# NOTE(review): the ".cvs" extension looks like a typo for ".csv"; it is kept
# unchanged because other code may read this exact path -- confirm and rename.
fwrite(x = res.transcript_level, file = "../dat/1901/res.distal.peaks.fisher.cvs")