In [3]:
require(data.table)
require(tidyverse)

## 1. Read data 

In [3]:
# Load the long-format matrix and keep only the rows whose tr.idx is NA
# (presumably peaks with no transcript assigned -- confirm against the
# upstream file).
dat.mat <- filter(
    fread("gzcat ../dat/1910_v2//long_matrix_w_transcripts.txt.gz"),
    is.na(tr.idx)
)
# Show the dimensions and the first rows of the filtered table.
dim(dat.mat)
head(dat.mat)


seq,start,end,cell,value,tr.idx,gene
<chr>,<int>,<int>,<chr>,<int>,<int>,<chr>
chr10,100005281,100005858,Islet1fresh_AGACACCTCGATCAGTACTGCATAGGCTC,2,,
chr10,100005281,100005858,Islet1fresh_AGACACCTCTCTCTACGTAAGGAGAGGCG,2,,
chr10,100005281,100005858,Islet1fresh_AGACACCTCTCTCTACGTAAGGAGGGCTC,2,,
chr10,100005281,100005858,Islet1fresh_AGCGATAGTCCTGAGCTTATGCGAGTACT,1,,
chr10,100005281,100005858,Islet1fresh_ATTACTCGATCTCAGGAAGGAGTAGTACT,2,,
chr10,100005281,100005858,Islet1fresh_ATTACTCGCTCTGGTAGTAAGGAGCCTAT,1,,


 Concepts: 
1. `promoter region`: ±500 bp around every TSS in GENCODE
2. `promoter Peaks`: peaks that overlap promoter region

In [4]:
# Build a "<seq>:<start>-<end>" peak identifier (first "chr" removed from
# seq) and keep one row per (peak, cell) pair.
dat.mat <- dat.mat %>%
    mutate(
        seq = sub("chr", "", seq),
        distal_peak = paste0(seq, ":", start, "-", end)
    ) %>%
    select(distal_peak, cell) %>%
    distinct()
dim(dat.mat)

In [5]:
# First row before the cluster labels are attached.
head(dat.mat, 1)
# Attach cluster_name to every row by matching `cell` against the `index`
# column of the cluster-label file.
dat.mat <- left_join(
    dat.mat,
    select(fread("../dat/1910_v2/islet.cluster_labels.filt.txt"), index, cluster_name),
    by = c(cell = "index")
)
# First row and dimensions after the join.
head(dat.mat, 1)
dim(dat.mat)

distal_peak,cell
<chr>,<chr>
10:100005281-100005858,Islet1fresh_AGACACCTCGATCAGTACTGCATAGGCTC


distal_peak,cell,cluster_name
<chr>,<chr>,<chr>
10:100005281-100005858,Islet1fresh_AGACACCTCGATCAGTACTGCATAGGCTC,alpha_1


In [6]:
# get all cells 
# Count how many rows of the cluster-label file fall in each cluster; the
# Fisher tests further down index this table by cluster name to get the
# per-cluster totals.
# NOTE(review): the join on this same file selects a column named
# `cluster_name`. If the file has no column literally named `cluster`,
# `$cluster` only works here through partial name matching -- confirm.
dat.all.cells <- table((fread('../dat/1910_v2/islet.cluster_labels.filt.txt'))$cluster) # table for all cells
dat.all.cells
sum(dat.all.cells)



     acinar     alpha_1     alpha_2      beta_1      beta_2     delta_1 
         46        5191        1027        4204        3394         478 
    delta_2      ductal endothelial       gamma      immune    stellate 
        232          80         118         260         140         128 

In [7]:
# Display the cluster names with the first "_1" or "_2" removed (the result is
# only shown, not stored).
sub('(_1)|(_2)','',names(dat.all.cells))

In [8]:
# Sum of the per-cluster counts in dat.all.cells.
sum(dat.all.cells)

In [9]:
# Number of distinct peak identifiers in dat.mat.
dat.mat$distal_peak %>%
    unique() %>%
    length()

In [10]:
# Shorten the column name: cluster_name -> cluster.
dat.mat <- rename(dat.mat, cluster = cluster_name)

## 2 Prepare data


Peak|n_cells|celltype 
--- | --- | ---


In [11]:
# Collapse dat.mat to one row per (peak, cluster) with N = number of rows in
# that group, then sort by peak and cluster.
setDT(dat.mat)
dat.mat <- arrange(
    dat.mat[, .N, by = .(distal_peak, cluster)],
    distal_peak,
    cluster
)

In [12]:
# First row of the per-(peak, cluster) count table.
head(dat.mat,1)

distal_peak,cluster,N
<chr>,<chr>,<int>
1:100009712-100010377,alpha_1,28


## 3. Fisher's exact test  between subtypes

A peak is counted as open in a cell as long as that cell has any signal in the peak.

1. get total alpha 1 and alpha 2 cells 
2. test hits in alpha_1 vs hits in alpha_2 (create a contingency table)
3. perform [Fisher's exact test](https://en.wikipedia.org/wiki/Fisher%27s_exact_test) or [chi-squared test](https://en.wikipedia.org/wiki/Chi-squared_test)

Input: `dat.mat` and `dat.all.cells`
output: 

peak|n_celltype1|n_celltype2|total_cells|celltype | pval | frac_1 | frac_2| odds


### 3.1 perform fisher's exact test for distal peaks

In [13]:
# Worked example of the per-peak test: one peak, alpha_1 vs alpha_2.
tr <- "1:100009712-100010377"
celltypes <- c("alpha_1", "alpha_2")

# Per-cluster counts for this peak, restricted to the two subtypes.
test.dat <- filter(dat.mat, distal_peak == tr & cluster %in% celltypes)
test.dat
table.res <- setNames(test.dat$N, test.dat$cluster)
# handle if 0 for one subtype
a <- setdiff(celltypes, names(table.res))
table.res[a] <- 0
# 2x2 table filled column by column: (with peak, without peak) for the first
# subtype, then for the second. "Without" = cluster total minus "with".
test.tab <- matrix(
    c(
        table.res[celltypes[1]], dat.all.cells[celltypes[1]] - table.res[celltypes[1]],
        table.res[celltypes[2]], dat.all.cells[celltypes[2]] - table.res[celltypes[2]]
    ),
    nrow = 2,
    dimnames = list(expressed = c("Yes", "No"), subtype = celltypes)
)
test.tab
test.tab[1]
test.tab[2]


# Linear indices run down the columns: 1:2 is the first subtype, 3:4 the second.
f1 <- test.tab[1] / sum(test.tab[1:2])
f2 <- test.tab[3] / sum(test.tab[3:4])
# One-sided test in the direction of the observed difference.
l <- ifelse(f1 > f2, "greater", "less")
test.res <- fisher.test(test.tab, alternative = l)
res <- list(
    pval = test.res$p.value,
    odds = test.res$estimate,
    type1_frac = f1,
    type2_frac = f2
)
res

# Two-sided test on the same table, for comparison.
fisher.test(test.tab)

distal_peak,cluster,N
<chr>,<chr>,<int>
1:100009712-100010377,alpha_1,28
1:100009712-100010377,alpha_2,4


Unnamed: 0,alpha_1,alpha_2
Yes,28,4
No,5163,1023



	Fisher's Exact Test for Count Data

data:  test.tab
p-value = 0.8104
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.4837113 5.4531250
sample estimates:
odds ratio 
  1.386916 


In [14]:

#' Fisher's exact test of one peak between two subtypes.
#'
#' Builds the 2x2 table (peak present "Yes"/"No" x subtype) for peak `tr` from
#' the per-(peak, cluster) counts in `dat` and the per-cluster totals in
#' `all.cells`, then runs `fisher.test()` on it.
#'
#' @param tr Peak identifier, matched against `dat$distal_peak`.
#' @param celltypes Character vector of length 2: the two clusters to compare.
#' @param dat Table with columns `distal_peak`, `cluster` and `N`.
#' @param all.cells Named per-cluster totals, indexed by cluster name.
#'   Defaults to the global `dat.all.cells`.
#' @param alternative `NULL` (default) keeps the original behaviour: a
#'   one-sided test whose direction is picked from the data ("greater" when
#'   the first subtype's fraction is larger, otherwise "less"). Because the
#'   direction is chosen after looking at the data, these p-values are smaller
#'   than two-sided ones (for the default peak: 0.372 vs 0.810). Pass
#'   "two.sided", "greater" or "less" to fix the alternative explicitly.
#' @return A list with `distal_peak`, `pval`, `odds` (the estimate reported by
#'   `fisher.test()`), `type1_frac` and `type2_frac` (fraction of each
#'   subtype's cells that have the peak).
fun.ftestPerTr <- function(tr = "1:100009712-100010377", # =1
                           celltypes = c("alpha_1", "alpha_2"),
                           dat = dat.mat,
                           all.cells = dat.all.cells,
                           alternative = NULL) {
    stopifnot(length(celltypes) == 2L)

    # Fail early with a readable message instead of an NA-filled table.
    unknown <- setdiff(celltypes, names(all.cells))
    if (length(unknown) > 0) {
        stop("no cell totals for cluster(s): ", paste(unknown, collapse = ", "),
             call. = FALSE)
    }

    # which() drops NA comparisons, like a dplyr filter would.
    rows <- which(dat$distal_peak == tr & dat$cluster %in% celltypes)
    table.res <- dat$N[rows]
    names(table.res) <- dat$cluster[rows]

    # handle if 0 for one subtype
    table.res[setdiff(celltypes, names(table.res))] <- 0

    n.open <- unname(table.res[celltypes])
    n.total <- as.vector(all.cells[celltypes])
    if (any(n.total <= 0)) {
        stop("cell totals must be positive for: ",
             paste(celltypes, collapse = ", "), call. = FALSE)
    }

    test.tab <- matrix(c(n.open, n.total - n.open),
                       byrow = TRUE,
                       nrow = 2,
                       dimnames = list(expressed = c("Yes", "No"),
                                       subtype = celltypes))

    f1 <- n.open[1] / n.total[1]
    f2 <- n.open[2] / n.total[2]

    if (is.null(alternative)) {
        alternative <- if (f1 > f2) "greater" else "less"
    } else {
        alternative <- match.arg(alternative, c("two.sided", "greater", "less"))
    }

    test.res <- fisher.test(test.tab, alternative = alternative)
    list(
        distal_peak = tr,
        pval = test.res$p.value,
        odds = test.res$estimate,
        type1_frac = f1,
        type2_frac = f2
    )
}

##

celltypes <- c('alpha_1','alpha_2')

# Time a single call (default peak, default data), then show the returned list
# transposed into one row.
system.time(fun.ftestPerTr(celltypes=celltypes))
t(fun.ftestPerTr(celltypes=celltypes))


   user  system elapsed 
  0.068   0.009   0.077 

distal_peak,pval,odds,type1_frac,type2_frac
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
1:100009712-100010377,0.3721211,1.386916,0.005393951,0.003894839


In [15]:

# Restrict the count table to the two alpha subtypes up front.
celltypes <- c("alpha_1", "alpha_2")
dat.mat.sub <- filter(dat.mat, cluster %in% celltypes)
#' Fisher's exact test of one peak between two subtypes (pre-filtered input).
#'
#' Same test as a cluster-filtered lookup, but rows of `dat` are selected by
#' peak only. Counts for clusters outside `celltypes` are simply never read
#' when the 2x2 table (peak present "Yes"/"No" x subtype) is built.
#'
#' @param tr Peak identifier, matched against `dat$distal_peak`.
#' @param celltypes Character vector of length 2: the two clusters to compare.
#' @param dat Table with columns `distal_peak`, `cluster` and `N`; defaults to
#'   the global `dat.mat.sub`.
#' @param all.cells Named per-cluster totals, indexed by cluster name.
#'   Defaults to the global `dat.all.cells`.
#' @param alternative `NULL` (default) keeps the original behaviour: a
#'   one-sided test whose direction is picked from the data ("greater" when
#'   the first subtype's fraction is larger, otherwise "less"). Because the
#'   direction is chosen after looking at the data, these p-values are smaller
#'   than two-sided ones (for the default peak: 0.372 vs 0.810). Pass
#'   "two.sided", "greater" or "less" to fix the alternative explicitly.
#' @return A list with `distal_peak`, `pval`, `odds` (the estimate reported by
#'   `fisher.test()`), `type1_frac` and `type2_frac` (fraction of each
#'   subtype's cells that have the peak).
fun.ftestPerTr <- function(tr = "1:100009712-100010377", # =1
                           celltypes = c("alpha_1", "alpha_2"),
                           dat = dat.mat.sub,
                           all.cells = dat.all.cells,
                           alternative = NULL) {
    stopifnot(length(celltypes) == 2L)

    # Fail early with a readable message instead of an NA-filled table.
    unknown <- setdiff(celltypes, names(all.cells))
    if (length(unknown) > 0) {
        stop("no cell totals for cluster(s): ", paste(unknown, collapse = ", "),
             call. = FALSE)
    }

    # which() drops NA comparisons, like a dplyr filter would.
    rows <- which(dat$distal_peak == tr)
    table.res <- dat$N[rows]
    names(table.res) <- dat$cluster[rows]

    # handle if 0 for one subtype
    table.res[setdiff(celltypes, names(table.res))] <- 0

    n.open <- unname(table.res[celltypes])
    n.total <- as.vector(all.cells[celltypes])
    if (any(n.total <= 0)) {
        stop("cell totals must be positive for: ",
             paste(celltypes, collapse = ", "), call. = FALSE)
    }

    test.tab <- matrix(c(n.open, n.total - n.open),
                       byrow = TRUE,
                       nrow = 2,
                       dimnames = list(expressed = c("Yes", "No"),
                                       subtype = celltypes))

    f1 <- n.open[1] / n.total[1]
    f2 <- n.open[2] / n.total[2]

    if (is.null(alternative)) {
        alternative <- if (f1 > f2) "greater" else "less"
    } else {
        alternative <- match.arg(alternative, c("two.sided", "greater", "less"))
    }

    test.res <- fisher.test(test.tab, alternative = alternative)
    list(
        distal_peak = tr,
        pval = test.res$p.value,
        odds = test.res$estimate,
        type1_frac = f1,
        type2_frac = f2
    )
}

##



# Time a single call of the redefined function (default data is now the
# pre-filtered dat.mat.sub), then show its result as one row.
system.time(fun.ftestPerTr(celltypes=celltypes))
t(fun.ftestPerTr(celltypes=celltypes))


   user  system elapsed 
  0.024   0.001   0.024 

distal_peak,pval,odds,type1_frac,type2_frac
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
1:100009712-100010377,0.3721211,1.386916,0.005393951,0.003894839


In [16]:

# estimate time
# Subtype pairs to compare, keyed by cell type.
celltypes <- list(
    alpha = c("alpha_1", "alpha_2"),
    beta = c("beta_1", "beta_2"),
    delta = c("delta_1", "delta_2")
)
# Total number of (cell type, peak) tests times .029, divided by 3600
# (presumably seconds per test converted to hours -- confirm the .029 figure).
sum(sapply(
    c("alpha", "beta", "delta"),
    function(x) {
        in.type <- dat.mat %>% filter(cluster %in% celltypes[[x]])
        length(unique(in.type$distal_peak))
    }
)) * .029 / 3600

In [17]:
# Subtype pairs to compare, keyed by cell type.
celltypes <- list(
    alpha = c("alpha_1", "alpha_2"),
    beta = c("beta_1", "beta_2"),
    delta = c("delta_1", "delta_2")
)

In [18]:
## test run

# Trial run on the beta subtypes: test the first 10 peaks only.
x <- "beta"
dat.mat.sub <- filter(dat.mat, cluster %in% celltypes[[x]])
all.tr <- unique(dat.mat.sub$distal_peak)
length(all.tr)
do.call(
    rbind,
    lapply(all.tr[1:10], function(trr) {
        # t() turns the returned list into a one-row matrix so rows can be stacked.
        t(fun.ftestPerTr(dat = dat.mat.sub, tr = trr, celltypes = celltypes[[x]]))
    })
)

distal_peak,pval,odds,type1_frac,type2_frac,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0,Unnamed: 20_level_0
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1:100009712-100010377,0.2891137,0.9213313,0.03020932,0.03270477,,,,,,,,,,,,,,,,
1:100014525-100015253,0.05388995,1.304241,0.02521408,0.01944608,,,,,,,,,,,,,,,,
1:100017588-100018137,4.131075e-06,0.1485343,0.001189343,0.007955215,,,,,,,,,,,,,,,,
1:100023302-100023931,0.003786372,0.3216857,0.00190295,0.005892752,,,,,,,,,,,,,,,,
1:100056228-100056811,0.008416978,0.6015636,0.009990485,0.01649971,,,,,,,,,,,,,,,,
1:100064582-100064908,0.6938878,0.8073071,0.0002378687,0.0002946376,,,,,,,,,,,,,,,,
1:100065306-100065506,0.4203372,0.4035669,0.0002378687,0.0005892752,,,,,,,,,,,,,,,,
1:10010367-10010795,0.02110914,1.520098,0.01736441,0.01149087,,,,,,,,,,,,,,,,
1:100113698-100114049,0.0002234783,1.671828,0.03449096,0.02091927,,,,,,,,,,,,,,,,
1:100122820-100123020,0.4466965,0.0,0.0,0.0002946376,,,,,,,,,,,,,,,,


In [19]:
require(parallel)

# Results are collected per cell type ("alpha", "beta", "delta").
res.transcript_level <- list()
# time-consuming task
# The whole loop is wrapped in system.time() to report the run time; the
# assignments into res.transcript_level made inside it are used by later cells.
system.time(for (x in c("alpha", "beta", "delta")) {
    
    # Counts restricted to the two subtypes of this cell type, and the peaks
    # that appear in at least one of them.
    dat.mat.sub <- dat.mat %>% filter(cluster %in% celltypes[[x]])
    all.tr <- unique(dat.mat.sub$distal_peak)
    
    # One test per peak, spread over 10 cores with mclapply; each result list
    # is transposed to a one-row matrix and the rows are stacked.
    res.transcript_level[[x]] <- do.call(rbind, mclapply(all.tr, function(trr) t(fun.ftestPerTr(dat = dat.mat.sub, 
        tr = trr, celltypes = celltypes[[x]])), mc.cores = 10))
})

# NOTE(review): every other path in this notebook is under ../dat/1910_v2/;
# confirm that ../dat/1901/ is the intended output directory and not a typo.
saveRDS(object = res.transcript_level, file = "../dat/1901/res.distal.peaks.fisher.Rds")

Loading required package: parallel


    user   system  elapsed 
5507.796  925.831  653.546 

In [20]:
# Stack the per-cell-type result tables into one data frame, tagging every
# row with the cell type it came from.
res.transcript_level <- do.call(
    rbind,
    lapply(c("alpha", "beta", "delta"), function(ct) {
        data.frame(res.transcript_level[[ct]], celltype = ct)
    })
)


In [22]:
# Write the combined results as CSV, then compress the file in place.
fwrite(res.transcript_level, file = "../dat/1910_v2/distal.peaks.sub_vs_sub.fisher.csv")
# -f: replace a .csv.gz left by a previous run. Without it gzip does not
# overwrite an existing archive, so the old .csv.gz would silently survive
# next to the new uncompressed CSV. The exit status is checked so a failed
# compression is not missed.
if (system("gzip -9 -f ../dat/1910_v2/distal.peaks.sub_vs_sub.fisher.csv") != 0) {
    stop("gzip failed for ../dat/1910_v2/distal.peaks.sub_vs_sub.fisher.csv", call. = FALSE)
}

In [6]:
# Reload the saved per-peak test results and check the first row and row count.
res.transcript_level <- fread("gzcat ../dat/1910_v2/distal.peaks.sub_vs_sub.fisher.csv.gz")
head(res.transcript_level, 1)
nrow(res.transcript_level)

distal_peak,pval,odds,type1_frac,type2_frac,celltype
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1:100009712-100010377,0.3721211,1.386916,0.005393951,0.003894839,alpha


### 3.2 p-value adjustment + filtering so that only peaks present in the tested cell types are kept

In [12]:
# Load the peak annotation, build the same "<chrom>:<start>-<end>" identifier
# used for the tested peaks (first "chr" removed from V1), and keep only peaks
# that were tested, together with their V4 annotation.
res.peak <- fread("../dat/1910_v2/islet.merged_peaks.anno.bed") %>%
    mutate(
        V1 = sub("chr", "", V1),
        distal_peak = paste0(V1, ":", V2, "-", V3)
    ) %>%
    filter(distal_peak %in% res.transcript_level$distal_peak) %>%
    select(distal_peak, V4)
head(res.peak, 1)
nrow(res.peak)

distal_peak,V4
<chr>,<chr>
1:752578-752778,endothelial


In [13]:
# adjust for p value
# Names are the output column names, values the p.adjust() methods.
m <- c(FDR = "fdr", padj.Bonferroni = "bonferroni", FDR.BY = "BY")
res.transcript_level.2 <- do.call(rbind, lapply(c("alpha", "beta", "delta"), function(ct) {
    # One row per (peak, p-value) for this cell type.
    pvals.ct <- res.transcript_level %>%
        filter(celltype == ct) %>%
        select(distal_peak, pval, celltype) %>%
        distinct() %>%
        as.data.frame()
    print(nrow(pvals.ct))

    # Keep only peaks whose V4 annotation matches this cell type's name.
    peaks.ct <- res.peak %>%
        filter(grepl(ct, V4)) %>%
        pull(distal_peak)
    pvals.ct <- pvals.ct %>% filter(distal_peak %in% peaks.ct)
    print(nrow(pvals.ct))

    # Adjustment is computed over the peaks that survived the filter.
    for (col in names(m)) {
        pvals.ct[[col]] <- p.adjust(as.numeric(pvals.ct$pval), method = m[[col]])
    }
    pvals.ct
}))
head(res.transcript_level.2, 1)
res.transcript_level.2 %>% filter(celltype == "beta") %>% head(1)
res.transcript_level.2 %>% filter(celltype == "delta") %>% head(1)

[1] 203175
[1] 125016
[1] 204254
[1] 133294
[1] 152886
[1] 64188


distal_peak,pval,celltype,FDR,padj.Bonferroni,FDR.BY
<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>
1:100009712-100010377,0.3721211,alpha,0.4540326,1,1


distal_peak,pval,celltype,FDR,padj.Bonferroni,FDR.BY
<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>
1:100009712-100010377,0.2891137,beta,0.3668559,1,1


distal_peak,pval,celltype,FDR,padj.Bonferroni,FDR.BY
<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>
1:100023302-100023931,0.1995241,delta,0.4101238,1,1


In [14]:
# Sanity checks before replacing res.transcript_level: compare row counts of
# the unfiltered results and the filtered/adjusted table, then preview the
# join that brings type1_frac, type2_frac and odds back onto the adjusted
# table. No `by` is given, so the join uses the columns the two tables share.
res.transcript_level %>% nrow
res.transcript_level.2 %>% nrow
res.transcript_level %>% head(1)
# Row count after the join.
res.transcript_level.2 %>% left_join(res.transcript_level %>% select(distal_peak, 
    celltype, type1_frac, type2_frac, odds) %>% distinct) %>% nrow
# First joined row.
res.transcript_level.2 %>% left_join(res.transcript_level %>% select(distal_peak, 
    celltype, type1_frac, type2_frac, odds) %>% distinct) %>% head(1)

# Same two checks repeated.
res.transcript_level.2 %>% left_join(res.transcript_level %>% select(distal_peak, 
    celltype, type1_frac, type2_frac,  odds) %>% distinct) %>% nrow

res.transcript_level.2 %>% left_join(res.transcript_level %>% select(distal_peak, 
    celltype, type1_frac, type2_frac, odds) %>% distinct) %>% head(1)


# Same join with the columns put in their final order.
res.transcript_level.2 %>% left_join(res.transcript_level %>% select(distal_peak, 
    celltype, type1_frac, type2_frac,  odds) %>% distinct) %>% select(distal_peak, 
    celltype, type1_frac, type2_frac, odds, pval, FDR, padj.Bonferroni, FDR.BY) %>% 
    head(1)



distal_peak,pval,odds,type1_frac,type2_frac,celltype
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1:100009712-100010377,0.3721211,1.386916,0.005393951,0.003894839,alpha


Joining, by = c("distal_peak", "celltype")


Joining, by = c("distal_peak", "celltype")


distal_peak,pval,celltype,FDR,padj.Bonferroni,FDR.BY,type1_frac,type2_frac,odds
<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1:100009712-100010377,0.3721211,alpha,0.4540326,1,1,0.005393951,0.003894839,1.386916


Joining, by = c("distal_peak", "celltype")


Joining, by = c("distal_peak", "celltype")


distal_peak,pval,celltype,FDR,padj.Bonferroni,FDR.BY,type1_frac,type2_frac,odds
<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1:100009712-100010377,0.3721211,alpha,0.4540326,1,1,0.005393951,0.003894839,1.386916


Joining, by = c("distal_peak", "celltype")


distal_peak,celltype,type1_frac,type2_frac,odds,pval,FDR,padj.Bonferroni,FDR.BY
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1:100009712-100010377,alpha,0.005393951,0.003894839,1.386916,0.3721211,0.4540326,1,1


In [16]:

# Preview the first row of the final table: adjusted p-values joined with the
# per-subtype fractions and odds, columns in their final order.
head(
    res.transcript_level.2 %>%
        left_join(
            res.transcript_level %>%
                select(distal_peak, celltype, type1_frac, type2_frac, odds) %>%
                distinct()
        ) %>%
        select(
            distal_peak, celltype, type1_frac, type2_frac, odds,
            pval, FDR, padj.Bonferroni, FDR.BY
        ),
    1
)

Joining, by = c("distal_peak", "celltype")


distal_peak,celltype,type1_frac,type2_frac,odds,pval,FDR,padj.Bonferroni,FDR.BY
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1:100009712-100010377,alpha,0.005393951,0.003894839,1.386916,0.3721211,0.4540326,1,1


In [17]:
# Replace res.transcript_level with the filtered, p-adjusted table, adding
# type1_frac, type2_frac and odds back from the unfiltered results.
# The join keys are spelled out (they are the two columns the implicit join
# reported) so the join cannot silently start matching on another column if
# either table gains one.
res.transcript_level <- res.transcript_level.2 %>%
    left_join(
        res.transcript_level %>%
            select(distal_peak, celltype, type1_frac, type2_frac, odds) %>%
            distinct(),
        by = c("distal_peak", "celltype")
    ) %>%
    select(
        distal_peak, celltype, type1_frac, type2_frac, odds,
        pval, FDR, padj.Bonferroni, FDR.BY
    )

Joining, by = c("distal_peak", "celltype")


In [18]:
# Row count of the final table.
res.transcript_level%>%nrow

In [19]:
# Write the final table as CSV, then compress the file in place.
fwrite(res.transcript_level, file = "../dat/1910_v2/distal.peaks.sub_vs_sub.fisher.csv")
# -f: the .csv.gz read earlier in this notebook already exists, and gzip does
# not overwrite an existing archive without it. The stale .csv.gz would then
# be left in place next to the new uncompressed CSV. The exit status is
# checked so a failed compression is not missed.
if (system("gzip -9 -f ../dat/1910_v2/distal.peaks.sub_vs_sub.fisher.csv") != 0) {
    stop("gzip failed for ../dat/1910_v2/distal.peaks.sub_vs_sub.fisher.csv", call. = FALSE)
}