In [2]:
require(data.table)
require(tidyverse)

## 1. Read data 

In [3]:
dat.mat <- fread('../dat/1901/alpha_beta.promoter.long_matrix_w_transcripts.txt')
dim(dat.mat)
head(dat.mat)
#dat.mat$cell <- NULL #no need cell id 

seq,start,end,gene,cluster,cell,transcript.idx
chr1,859052,860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTAGGCAGAAGTAAGGAGCAGGA,7
chr1,859052,860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTATGCGCAGCGTCTAATGGTTG,7
chr1,859052,860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTCGAGGCTGAAGGCTATGGTTG,7
chr1,859052,860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTCGTACTAGCTAAGCCTGTACT,7
chr1,859052,860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTGGACTCCTTCGACTAGGGTTG,7
chr1,859052,860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTGGAGCTACAAGGAGTAAGGCG,7


 Concepts: 
1. `promoter region`: -500bp + 500bp for all TSS in gencode 
2. `promoter Peaks`: peaks that overlap promoter region

In [4]:
# get all cells 
dat.all.cells <- table((fread('../dat/output.umap.ab.filtered.csv'))$cluster) # table for all cells
dat.all.cells
sum(dat.all.cells)
length(unique(dat.mat$cell))


alpha_1 alpha_2  beta_1  beta_2 
   4266    1328    4354    2816 

## 2 Prepare data

In [5]:
# prepare data
dat.mat.transcript_level <- dat.mat%>%
    select(-one_of("seq","start","end"))%>%
    distinct()%>%
    select(-cell)

#    unite("gene_tr.idx",c("gene","transcript.idx"),remove = T)
head(dat.mat.transcript_level)
dim(dat.mat.transcript_level)

gene,cluster,transcript.idx
SAMD11,alpha_1,7
SAMD11,alpha_1,7
SAMD11,alpha_2,7
SAMD11,alpha_2,7
SAMD11,alpha_1,7
SAMD11,alpha_2,7


In [6]:
cat("Check how rows changed:\n") 
cat(sprintf("Before applying uniquness, # of rows:%d\n",nrow(dat.mat)))
cat(sprintf("After applying uniquness, # of rows:%d\n",nrow(dat.mat.transcript_level)))

Check how rows changed:
Before applying uniquness, # of rows:19412387
After applying uniquness, # of rows:19381542


### 2.1 Special cases: TSS too close to two genes

- In this cases, the same peak may overlap with two genes' promoters
- Note: `foverlap` results selected the 1st matches. 

In [10]:
#fun.ftestPerGene(dat=dat.sub,tr = x)
x <- 4016
head(dat.mat.transcript_level%>% filter(transcript.idx==x))

## sed -n 4015,4017p alpha.transcript_promoter_peaks.bed
cat("chr11	105947325	105948873	AASDHPPT\n
chr11	105947325	105948873	KBTBD3")

gene,cluster,transcript.idx
AASDHPPT,alpha_1,4016
KBTBD3,alpha_1,4016
AASDHPPT,alpha_1,4016
KBTBD3,alpha_1,4016
AASDHPPT,alpha_2,4016
KBTBD3,alpha_2,4016


chr11	105947325	105948873	AASDHPPT

chr11	105947325	105948873	KBTBD3

In [7]:
celltypes <- c('alpha_1','alpha_2')
dat.sub <- dat.mat.transcript_level%>%
    filter(cluster %in%celltypes)


dat.sub.red <- dat.sub %>% select(-cluster)%>%group_by(transcript.idx)%>% unique()
setDT(dat.sub.red)
idx <- which(duplicated(dat.sub.red,by = "transcript.idx"))
head(dat.sub.red[idx,])
head(dat.sub.red[idx-1,])

gene,transcript.idx
SDF4,84
PUSL1,131
GLTPD1,157
RP4-758J18.2,223
SSU72,273
RER1,452


gene,transcript.idx
B3GALT6,84
ACAP3,131
CPSF3L,157
CCNL2,223
AL645728.1,273
MORN1,452


#####  In the above examples, each row is a peak 

#### 2.2.2 Handle these special cases by cat gene and tr.idx

In [8]:
dat.mat.transcript_level<-dat.mat.transcript_level%>%
    unite("gene_tr.idx",c("gene","transcript.idx"),remove = T)
head(dat.mat.transcript_level)

gene_tr.idx,cluster
SAMD11_7,alpha_1
SAMD11_7,alpha_1
SAMD11_7,alpha_2
SAMD11_7,alpha_2
SAMD11_7,alpha_1
SAMD11_7,alpha_2


## 3. Fisher's exact test at transcript level 

As long as there is any promoter peaks in that cell, the gene's promoter is open. 

1. get total alpha 1 and alpha 2 cells 
2. test hit in alpha1 vs hit in alpha2  (create contentigen table)
3. perform [Fisher's exact test](https://en.wikipedia.org/wiki/Fisher%27s_exact_test) or [chi-squared test](https://en.wikipedia.org/wiki/Chi-squared_test)

### 3.1 perform fisher's exact test for all transcripts

In [11]:
cat(sprintf("There are %d unique transcripts for alpha cells\n",
            length(unique((dat.mat.transcript_level%>%filter(cluster %in%c("alpha_1","alpha_2")))$gene_tr.idx))))
cat(sprintf("There are %d unique transcripts for beta cells\n",
            length(unique((dat.mat.transcript_level%>%filter(cluster %in%c("beta_1","beta_2")))$gene_tr.idx))))

There are 21715 unique transcripts for alpha cells
There are 21825 unique transcripts for beta cells


In [19]:
tr='SAMD11_7'
celltypes <- c('alpha_1','alpha_2')
dat.sub <- dat.mat.transcript_level%>%
    filter(cluster %in%celltypes)


    test.dat <- table((dat.sub%>% filter(gene_tr.idx==tr))$cluster)
    table.res <- as.vector(test.dat)
    names(table.res) <- names(test.dat)
    # handle if 0 for one subtype 
    a=setdiff(celltypes,    names(table.res))
    table.res[a]<-0
    
    test.tab <- matrix(c(table.res[celltypes[1]], table.res[celltypes[2]], 
                         dat.all.cells[celltypes[1]]-table.res[celltypes[1]], 
                         dat.all.cells[celltypes[2]]-table.res[celltypes[2]]),
                       byrow =  T,       
                       nrow = 2,
                       dimnames = list(expressed = c("Yes", "No"),
                       subtype = celltypes))
test.tab
test.tab[1]
test.tab[2]
    f1<-test.tab[1]/(test.tab[1]+test.tab[2])
    f2 <- test.tab[3]/(test.tab[3]+test.tab[4])
    l <-ifelse(f1>f2,'greater','less')
    test.res <- fisher.test(test.tab,alternative = l)
    res <- list(pval=test.res$p.value,
                odds=test.res$estimate,
                type1_frac=f1,
                type2_frac=f2
               )
    res

fisher.test(test.tab)

Unnamed: 0,alpha_1,alpha_2
Yes,83,47
No,4183,1281



	Fisher's Exact Test for Count Data

data:  test.tab
p-value = 0.001608
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.3715137 0.7954274
sample estimates:
odds ratio 
 0.5408936 


In [20]:
fun.ftestPerTr <- function(  tr='SAMD11_7',#=1
                             celltypes=c('alpha_1','alpha_2'),
                             dat=dat.mat.transcript_level%>%
                                  select(one_of("gene_tr.idx","cluster")%>%
                                  filter(cluster %in%celltypes))){
    

    test.dat <- table((dat.sub%>% filter(gene_tr.idx==tr))$cluster)
    table.res <- as.vector(test.dat)
    names(table.res) <- names(test.dat)
    # handle if 0 for one subtype 
    a=setdiff(celltypes,    names(table.res))
    table.res[a]<-0
    
    test.tab <- matrix(c(table.res[celltypes[1]], table.res[celltypes[2]], 
                         dat.all.cells[celltypes[1]]-table.res[celltypes[1]], 
                         dat.all.cells[celltypes[2]]-table.res[celltypes[2]]),
                       byrow =  T,       
                       nrow = 2,
                       dimnames = list(expressed = c("Yes", "No"),
                       subtype = celltypes))

    f1<-test.tab[1]/(test.tab[1]+test.tab[2])
    f2 <- test.tab[3]/(test.tab[3]+test.tab[4])
    l <-ifelse(f1>f2,'greater','less')
    test.res <- fisher.test(test.tab,alternative = l)
    res <- list(pval=test.res$p.value,
                odds=test.res$estimate,
                type1_frac=f1,
                type2_frac=f2
               )
    res
}

##

celltypes <- c('alpha_1','alpha_2')
dat.sub <- dat.mat.transcript_level%>%
                           select(one_of("gene_tr.idx","cluster"))%>%
                           filter(cluster %in%celltypes)
#fun.ftestPerGene(dat = dat.sub,tr=1)
system.time(fun.ftestPerTr(celltypes=celltypes))
fun.ftestPerTr(celltypes=celltypes)
system.time(fun.ftestPerTr(dat=dat.sub,celltypes=celltypes))  
fun.ftestPerTr(dat=dat.sub,celltypes=celltypes)
#all.tr <- unique(dat.sub$transcript.idx)
#for(x in all.tr){
#    fun.ftestPerGene(dat=dat.sub,tr = x)
#}



#fun.ftestPerGene(dat = dat.sub,tr=1)

   user  system elapsed 
  0.068   0.001   0.069 

   user  system elapsed 
  0.053   0.001   0.054 

In [21]:
celltypes <- c('beta_1','beta_2')
dat.sub <- dat.mat.transcript_level%>%
                           select(one_of("gene_tr.idx","cluster"))%>%
                           filter(cluster %in%celltypes)
fun.ftestPerTr(celltypes=celltypes)
fun.ftestPerTr(dat=dat.sub,celltypes=celltypes)


In [22]:
celltypes<- list()
celltypes$alpha <- c('alpha_1','alpha_2')
celltypes$beta <- c('beta_1','beta_2')
res.transcript_level <- list()
# time consumming task
system.time(for(x in c("alpha","beta")){
    require(parallel)
    dat.sub <- dat.mat.transcript_level%>%select(one_of("gene_tr.idx","cluster"))%>%filter(cluster %in% celltypes[[x]])
    all.tr <- unique(dat.sub$gene_tr.idx)
    res.transcript_level[[x]] <- (mclapply(all.tr,function(trr) fun.ftestPerTr(dat=dat.sub,
                                                            tr = trr,
                                                            celltypes = celltypes[[x]]),mc.cores = 8))
})
        

Loading required package: parallel


    user   system  elapsed 
2590.538  355.081  815.598 

In [42]:
head(res.transcript_level$alpha)

Unnamed: 0,pval,odds,type1_frac,type2_frac
SAMD11_7,0.0008038729,0.5408936,0.01945617,0.03539157
SAMD11_13,0.102674,0.6931602,0.008907642,0.0128012
SAMD11_14,0.000387331,0.4795206,0.01429911,0.02936747
NOC2L_19,0.1934535,0.8726655,0.0403188,0.04593373
KLHL17_23,0.2498725,0.8278814,0.01312705,0.01581325
PLEKHN1_27,0.000104741,0.43971,0.01312705,0.02936747


In [24]:
# alpha
x <- celltypes$alpha
dat.sub <- dat.mat.transcript_level%>%select(one_of("gene_tr.idx","cluster"))%>%filter(cluster %in% x)
all.tr <- unique(dat.sub$gene_tr.idx)
res.genes_level.a.df <- do.call(rbind,res.transcript_level$alpha)
rownames(res.genes_level.a.df) <- all.tr
head(res.genes_level.a.df)
res.transcript_level$alpha <-res.genes_level.a.df 

Unnamed: 0,pval,odds,type1_frac,type2_frac
SAMD11_7,0.0008745605,0.5408936,0.01945617,0.03539157
SAMD11_13,0.137223,0.6931602,0.008907642,0.0128012
SAMD11_14,0.0004316489,0.4795206,0.01429911,0.02936747
NOC2L_19,0.2061189,0.8726655,0.0403188,0.04593373
KLHL17_23,0.2698448,0.8278814,0.01312705,0.01581325
PLEKHN1_27,0.0001216835,0.43971,0.01312705,0.02936747


In [25]:
# beta
x <- celltypes$beta
dat.sub <- dat.mat.transcript_level%>%select(one_of("gene_tr.idx","cluster"))%>%filter(cluster %in% x)
all.tr <- unique(dat.sub$gene_tr.idx)
res.genes_level.a.df <- do.call(rbind,res.transcript_level$beta)
rownames(res.genes_level.a.df) <- all.tr
head(res.genes_level.a.df)
res.transcript_level$beta <-res.genes_level.a.df 

Unnamed: 0,pval,odds,type1_frac,type2_frac
SAMD11_7,0.4840908,1.012853,0.0413413,0.04083807
SAMD11_8,0.01847768,1.805673,0.01148369,0.006392045
SAMD11_13,0.002659781,1.409152,0.05029858,0.03622159
SAMD11_14,1.028895e-07,1.45374,0.1566376,0.1132812
NOC2L_17,1.524891e-10,1.948879,0.07854846,0.04190341
NOC2L_19,0.3290328,0.9384924,0.03674782,0.0390625


In [43]:
# beta
x <- celltypes$beta
dat.sub <- dat.mat.transcript_level%>%select(one_of("gene_tr.idx","cluster"))%>%filter(cluster %in% x)
all.tr <- unique(dat.sub$gene_tr.idx)
res.genes_level.a.df <- do.call(rbind,res.transcript_level$beta)
rownames(res.genes_level.a.df) <- all.tr
head(res.genes_level.a.df)
res.transcript_level$beta <-res.genes_level.a.df 

Unnamed: 0,pval,odds,type1_frac,type2_frac
SAMD11_7,0.4757446,1.012853,0.0413413,0.04083807
SAMD11_8,0.01671483,1.805673,0.01148369,0.006392045
SAMD11_13,0.002476792,1.409152,0.05029858,0.03622159
SAMD11_14,9.622286e-08,1.45374,0.1566376,0.1132812
NOC2L_17,1.421091e-10,1.948879,0.07854846,0.04190341
NOC2L_19,0.3060026,0.9384924,0.03674782,0.0390625


In [26]:
# adjust for p value 
m<- c("fdr","bonferroni","BY");names(m)<-c("FDR","padj.Bonferroni","FDR.BY")
res.transcript_level.2 <- lapply(res.transcript_level,function(df){
    res.genes_level.a.df <- as.data.frame(df)
    for(i in 1:3){
        res.genes_level.a.df[[names(m)[i]]] <- p.adjust(as.numeric(res.genes_level.a.df$pval),method = m[i])
    }
    res.genes_level.a.df$odds <- as.numeric(res.genes_level.a.df$odds)
    res.genes_level.a.df
})
head(res.transcript_level.2$alpha)
head(res.transcript_level.2$beta)

Unnamed: 0,pval,odds,type1_frac,type2_frac,FDR,padj.Bonferroni,FDR.BY
SAMD11_7,0.0008745605,0.5408936,0.01945617,0.03539157,0.0030385731,1,0.03209644
SAMD11_13,0.137223,0.6931602,0.008907642,0.0128012,0.1971548379,1,1.0
SAMD11_14,0.0004316489,0.4795206,0.01429911,0.02936747,0.0016642854,1,0.017579842
NOC2L_19,0.2061189,0.8726655,0.0403188,0.04593373,0.2701353235,1,1.0
KLHL17_23,0.2698448,0.8278814,0.01312705,0.01581325,0.3316704506,1,1.0
PLEKHN1_27,0.0001216835,0.43971,0.01312705,0.02936747,0.0005668516,1,0.005987652


Unnamed: 0,pval,odds,type1_frac,type2_frac,FDR,padj.Bonferroni,FDR.BY
SAMD11_7,0.4840908,1.0128525,0.0413413,0.04083807,0.5020328,1.0,1.0
SAMD11_8,0.01847768,1.8056728,0.01148369,0.006392045,0.04819257,1.0,0.5093015
SAMD11_13,0.002659781,1.4091523,0.05029858,0.03622159,0.0111763,1.0,0.1181117
SAMD11_14,1.028895e-07,1.4537397,0.1566376,0.1132812,3.417906e-06,0.002245564,3.61206e-05
NOC2L_17,1.524891e-10,1.9488789,0.07854846,0.04190341,1.410201e-08,3.328075e-06,1.490308e-07
NOC2L_19,0.3290328,0.9384924,0.03674782,0.0390625,0.3910231,1.0,1.0


In [27]:
res.transcript_level <- res.transcript_level.2
saveRDS(res.transcript_level,"../dat/1901/res.transcript_level.rds")

### adding back peak info

In [4]:
res<- readRDS("../dat/1901/res.transcript_level.rds")

In [5]:
head(res$alpha)
dim(res$alpha)

Unnamed: 0,pval,odds,type1_frac,type2_frac,FDR,padj.Bonferroni,FDR.BY
SAMD11_7,0.0008745605,0.5408936,0.01945617,0.03539157,0.0030385731,1,0.03209644
SAMD11_13,0.137223,0.6931602,0.008907642,0.0128012,0.1971548379,1,1.0
SAMD11_14,0.0004316489,0.4795206,0.01429911,0.02936747,0.0016642854,1,0.017579842
NOC2L_19,0.2061189,0.8726655,0.0403188,0.04593373,0.2701353235,1,1.0
KLHL17_23,0.2698448,0.8278814,0.01312705,0.01581325,0.3316704506,1,1.0
PLEKHN1_27,0.0001216835,0.43971,0.01312705,0.02936747,0.0005668516,1,0.005987652


In [7]:
# prepare data
dat.mat.transcript_level <- dat.mat%>%
    select(-one_of("cell","cluster"))%>%
    distinct()%>%
    unite("V1",c("gene","transcript.idx"),sep = "_")%>%
    unite("tmp",c("start","end"),sep = "-")%>%
    unite("peak",c("seq","tmp"),sep = ":")

#    unite("gene_tr.idx",c("gene","transcript.idx"),remove = T)
head(dat.mat.transcript_level)
dim(dat.mat.transcript_level)

peak,V1
chr1:859052-860562,SAMD11_7
chr1:875573-875966,SAMD11_13
chr1:876931-878016,SAMD11_14
chr1:894277-895102,NOC2L_19
chr1:895801-896103,KLHL17_23
chr1:901696-902721,PLEKHN1_27


In [13]:
res.2 <- dat.mat.transcript_level%>% 
    rename(gene_transcript=V1)%>%
    right_join(res$alpha%>%rownames_to_column("gene_transcript"))%>%
    mutate(percent_open_state1=as.numeric(type1_frac)*100,
          percent_open_state2=as.numeric(type2_frac)*100)%>%
    select(-ends_with("_frac"))
dim(res.2)
head(res.2)
fwrite(res.2,"/Users/frank/Dropbox (UCSD_Epigenomics)/Islet_snATAC/panel_pdfs/sfigs/fig_2.prom_alpha_volcano.csv")


Joining, by = "gene_transcript"


peak,gene_transcript,pval,odds,FDR,padj.Bonferroni,FDR.BY,percent_open_state1,percent_open_state2
chr1:859052-860562,SAMD11_7,0.0008745605,0.5408936,0.0030385731,1,0.03209644,1.9456165,3.539157
chr1:875573-875966,SAMD11_13,0.137223,0.6931602,0.1971548379,1,1.0,0.8907642,1.28012
chr1:876931-878016,SAMD11_14,0.0004316489,0.4795206,0.0016642854,1,0.017579842,1.4299109,2.936747
chr1:894277-895102,NOC2L_19,0.2061189,0.8726655,0.2701353235,1,1.0,4.03188,4.593373
chr1:895801-896103,KLHL17_23,0.2698448,0.8278814,0.3316704506,1,1.0,1.3127051,1.581325
chr1:901696-902721,PLEKHN1_27,0.0001216835,0.43971,0.0005668516,1,0.005987652,1.3127051,2.936747


In [14]:
res.2 <- dat.mat.transcript_level%>% 
    rename(gene_transcript=V1)%>%
    right_join(res$beta%>%rownames_to_column("gene_transcript"))%>%
    mutate(percent_open_state1=as.numeric(type1_frac)*100,
          percent_open_state2=as.numeric(type2_frac)*100)%>%
    select(-ends_with("_frac"))
dim(res.2)
head(res.2)
fwrite(res.2,"/Users/frank/Dropbox (UCSD_Epigenomics)/Islet_snATAC/panel_pdfs/sfigs/fig_2.prom_beta_volcano.csv")


Joining, by = "gene_transcript"


peak,gene_transcript,pval,odds,FDR,padj.Bonferroni,FDR.BY,percent_open_state1,percent_open_state2
chr1:859052-860562,SAMD11_7,0.4840908,1.0128525,0.5020328,1.0,1.0,4.13413,4.0838068
chr1:860918-861175,SAMD11_8,0.01847768,1.8056728,0.04819257,1.0,0.5093015,1.148369,0.6392045
chr1:875573-875966,SAMD11_13,0.002659781,1.4091523,0.0111763,1.0,0.1181117,5.029858,3.6221591
chr1:876931-878016,SAMD11_14,1.028895e-07,1.4537397,3.417906e-06,0.002245564,3.61206e-05,15.663757,11.328125
chr1:879743-880454,NOC2L_17,1.524891e-10,1.9488789,1.410201e-08,3.328075e-06,1.490308e-07,7.854846,4.1903409
chr1:894277-895102,NOC2L_19,0.3290328,0.9384924,0.3910231,1.0,1.0,3.674782,3.90625
