In [2]:
require(data.table)
require(tidyverse)

## 1. Read data 

In [3]:
dat.mat <- fread('../dat/1901/alpha_beta.promoter.long_matrix_w_transcripts.txt')
dim(dat.mat)
head(dat.mat)
#dat.mat$cell <- NULL #no need cell id 

seq,start,end,gene,cluster,cell,transcript.idx
chr1,859052,860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTAGGCAGAAGTAAGGAGCAGGA,7
chr1,859052,860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTATGCGCAGCGTCTAATGGTTG,7
chr1,859052,860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTCGAGGCTGAAGGCTATGGTTG,7
chr1,859052,860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTCGTACTAGCTAAGCCTGTACT,7
chr1,859052,860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTGGACTCCTTCGACTAGGGTTG,7
chr1,859052,860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTGGAGCTACAAGGAGTAAGGCG,7


 Concepts: 
1. `promoter region`: -500bp + 500bp for all TSS in gencode 
2. `promoter Peaks`: peaks that overlap promoter region

In [4]:
# get all cells 
dat.all.cells <- table((fread('../dat/output.umap.ab.filtered.csv'))$cluster) # table for all cells
dat.all.cells
sum(dat.all.cells)
length(unique(dat.mat$cell))


alpha_1 alpha_2  beta_1  beta_2 
   4266    1328    4354    2816 

## 2 Prepare data

In [15]:
# prepare data
dat.mat.transcript_level <- dat.mat%>%
    select(-one_of("seq","start","end"))%>%
    distinct()%>%
    select(-cell)

#    unite("gene_tr.idx",c("gene","transcript.idx"),remove = T)
head(dat.mat.transcript_level)
dim(dat.mat.transcript_level)

gene,cluster,transcript.idx
SAMD11,alpha_1,7
SAMD11,alpha_1,7
SAMD11,alpha_2,7
SAMD11,alpha_2,7
SAMD11,alpha_1,7
SAMD11,alpha_2,7


In [13]:
cat("Check how rows changed:\n") 
cat(sprintf("Before applying uniquness, # of rows:%d\n",nrow(dat.mat)))
cat(sprintf("After applying uniquness, # of rows:%d\n",nrow(dat.mat.transcript_level)))

Check how rows changed:
Before applying uniquness, # of rows:19412387
After applying uniquness, # of rows:19381542


### 2.1 Special cases: TSS too close to two genes

- In this cases, the same peak may overlap with two genes' promoters
- Note: `foverlap` results selected the 1st matches. 

In [10]:
#fun.ftestPerGene(dat=dat.sub,tr = x)
x <- 4016
head(dat.mat.transcript_level%>% filter(transcript.idx==x))

## sed -n 4015,4017p alpha.transcript_promoter_peaks.bed
cat("chr11	105947325	105948873	AASDHPPT\n
chr11	105947325	105948873	KBTBD3")

gene,cluster,transcript.idx
AASDHPPT,alpha_1,4016
KBTBD3,alpha_1,4016
AASDHPPT,alpha_1,4016
KBTBD3,alpha_1,4016
AASDHPPT,alpha_2,4016
KBTBD3,alpha_2,4016


chr11	105947325	105948873	AASDHPPT

chr11	105947325	105948873	KBTBD3

In [16]:
celltypes <- c('alpha_1','alpha_2')
dat.sub <- dat.mat.transcript_level%>%
    filter(cluster %in%celltypes)


dat.sub.red <- dat.sub %>% select(-cluster)%>%group_by(transcript.idx)%>% unique()
setDT(dat.sub.red)
idx <- which(duplicated(dat.sub.red,by = "transcript.idx"))
head(dat.sub.red[idx,])
head(dat.sub.red[idx-1,])

gene,transcript.idx
SDF4,84
PUSL1,131
GLTPD1,157
RP4-758J18.2,223
SSU72,273
RER1,452


gene,transcript.idx
B3GALT6,84
ACAP3,131
CPSF3L,157
CCNL2,223
AL645728.1,273
MORN1,452


* In the above examples, each row is a peak 

#### 2.2.2 Handle these special cases by cat gene and tr.idx

In [17]:
dat.mat.transcript_level<-dat.mat.transcript_level%>%
    unite("gene_tr.idx",c("gene","transcript.idx"),remove = T)
head(dat.mat.transcript_level)

gene_tr.idx,cluster
SAMD11_7,alpha_1
SAMD11_7,alpha_1
SAMD11_7,alpha_2
SAMD11_7,alpha_2
SAMD11_7,alpha_1
SAMD11_7,alpha_2


## 3. Fisher's exact test at transcript level 

As long as there is any promoter peaks in that cell, the gene's promoter is open. 

1. get total alpha 1 and alpha 2 cells 
2. test hit in alpha1 vs hit in alpha2  (create contentigen table)
3. perform [Fisher's exact test](https://en.wikipedia.org/wiki/Fisher%27s_exact_test) or [chi-squared test](https://en.wikipedia.org/wiki/Chi-squared_test)

### 3.1 perform fisher's exact test for all transcripts

In [7]:
cat(sprintf("There are %d unique transcripts for alpha cells\n",
            length(unique((dat.mat.transcript_level%>%filter(cluster %in%c("alpha_1","alpha_2")))$transcript.idx))))
cat(sprintf("There are %d unique transcripts for beta cells\n",
            length(unique((dat.mat.transcript_level%>%filter(cluster %in%c("beta_1","beta_2")))$transcript.idx))))

There are 19665 unique transcripts for alpha cells
There are 19790 unique transcripts for beta cells


In [19]:
tr='SAMD11_7'
celltypes <- c('alpha_1','alpha_2')
dat.sub <- dat.mat.transcript_level%>%
    filter(cluster %in%celltypes)


test.dat <- table((dat.sub%>% filter(gene_tr.idx==tr))$cluster)
table.res <- as.vector(test.dat)
table.res
names(test.dat)

In [21]:
fun.ftestPerTr <- function(  tr='SAMD11_7',#=1
                             celltypes=c('alpha_1','alpha_2'),
                             dat=dat.mat.transcript_level%>%
                                  select(one_of("gene_tr.idx","cluster")%>%
                                  filter(cluster %in%celltypes))){
    

    test.dat <- table((dat.sub%>% filter(gene_tr.idx==tr))$cluster)
    table.res <- as.vector(test.dat)
    names(table.res) <- names(test.dat)
    # handle if 0 for one subtype 
    a=setdiff(celltypes,    names(table.res))
    table.res[a]<-0
    
    test.tab <- matrix(c(table.res[celltypes[1]], table.res[celltypes[2]], 
                         dat.all.cells[celltypes[1]]-table.res[celltypes[1]], 
                         dat.all.cells[celltypes[2]]-table.res[celltypes[2]]),
                       byrow =  T,       
                       nrow = 2,
                       dimnames = list(expressed = c("Yes", "No"),
                       subtype = celltypes))
    test.res <- fisher.test(test.tab)
    res <- list(pval=test.res$p.value/2,
                odds=test.res$estimate,
                type1_frac=test.tab[1]/(test.tab[1]+test.tab[2]),
                type2_frac=test.tab[3]/(test.tab[3]+test.tab[4])
               )
    res
}

##

celltypes <- c('alpha_1','alpha_2')
dat.sub <- dat.mat.transcript_level%>%
                           select(one_of("gene_tr.idx","cluster"))%>%
                           filter(cluster %in%celltypes)
#fun.ftestPerGene(dat = dat.sub,tr=1)
system.time(fun.ftestPerTr(celltypes=celltypes))
fun.ftestPerTr(celltypes=celltypes)
system.time(fun.ftestPerTr(dat=dat.sub,celltypes=celltypes))  
fun.ftestPerTr(dat=dat.sub,celltypes=celltypes)
#all.tr <- unique(dat.sub$transcript.idx)
#for(x in all.tr){
#    fun.ftestPerGene(dat=dat.sub,tr = x)
#}



#fun.ftestPerGene(dat = dat.sub,tr=1)

   user  system elapsed 
  0.064   0.002   0.073 

   user  system elapsed 
  0.052   0.001   0.055 

In [22]:
celltypes <- c('beta_1','beta_2')
dat.sub <- dat.mat.transcript_level%>%
                           select(one_of("gene_tr.idx","cluster"))%>%
                           filter(cluster %in%celltypes)
fun.ftestPerTr(celltypes=celltypes)
fun.ftestPerTr(dat=dat.sub,celltypes=celltypes)


In [38]:
celltypes<- list()
celltypes$alpha <- c('alpha_1','alpha_2')
celltypes$beta <- c('beta_1','beta_2')
res.transcript_level <- list()
# time consumming task
system.time(for(x in c("alpha","beta")){
    require(parallel)
    dat.sub <- dat.mat.transcript_level%>%select(one_of("gene_tr.idx","cluster"))%>%filter(cluster %in% celltypes[[x]])
    all.tr <- unique(dat.sub$gene_tr.idx)
    res.transcript_level[[x]] <- (mclapply(all.tr,function(trr) fun.ftestPerTr(dat=dat.sub,
                                                            tr = trr,
                                                            celltypes = celltypes[[x]]),mc.cores = 8))
})
        

    user   system  elapsed 
1407.150  202.741 1247.019 

In [42]:
head(res.transcript_level$alpha)

Unnamed: 0,pval,odds,type1_frac,type2_frac
SAMD11_7,0.0008038729,0.5408936,0.01945617,0.03539157
SAMD11_13,0.102674,0.6931602,0.008907642,0.0128012
SAMD11_14,0.000387331,0.4795206,0.01429911,0.02936747
NOC2L_19,0.1934535,0.8726655,0.0403188,0.04593373
KLHL17_23,0.2498725,0.8278814,0.01312705,0.01581325
PLEKHN1_27,0.000104741,0.43971,0.01312705,0.02936747


In [39]:
# alpha
x <- celltypes$alpha
dat.sub <- dat.mat.transcript_level%>%select(one_of("gene_tr.idx","cluster"))%>%filter(cluster %in% x)
all.tr <- unique(dat.sub$gene_tr.idx)
res.genes_level.a.df <- do.call(rbind,res.transcript_level$alpha)
rownames(res.genes_level.a.df) <- all.tr
head(res.genes_level.a.df)
res.transcript_level$alpha <-res.genes_level.a.df 

Unnamed: 0,pval,odds,type1_frac,type2_frac
SAMD11_7,0.0008038729,0.5408936,0.01945617,0.03539157
SAMD11_13,0.102674,0.6931602,0.008907642,0.0128012
SAMD11_14,0.000387331,0.4795206,0.01429911,0.02936747
NOC2L_19,0.1934535,0.8726655,0.0403188,0.04593373
KLHL17_23,0.2498725,0.8278814,0.01312705,0.01581325
PLEKHN1_27,0.000104741,0.43971,0.01312705,0.02936747


In [43]:
# beta
x <- celltypes$beta
dat.sub <- dat.mat.transcript_level%>%select(one_of("gene_tr.idx","cluster"))%>%filter(cluster %in% x)
all.tr <- unique(dat.sub$gene_tr.idx)
res.genes_level.a.df <- do.call(rbind,res.transcript_level$beta)
rownames(res.genes_level.a.df) <- all.tr
head(res.genes_level.a.df)
res.transcript_level$beta <-res.genes_level.a.df 

Unnamed: 0,pval,odds,type1_frac,type2_frac
SAMD11_7,0.4757446,1.012853,0.0413413,0.04083807
SAMD11_8,0.01671483,1.805673,0.01148369,0.006392045
SAMD11_13,0.002476792,1.409152,0.05029858,0.03622159
SAMD11_14,9.622286e-08,1.45374,0.1566376,0.1132812
NOC2L_17,1.421091e-10,1.948879,0.07854846,0.04190341
NOC2L_19,0.3060026,0.9384924,0.03674782,0.0390625


In [45]:
# adjust for p value 
m<- c("fdr","bonferroni","BY");names(m)<-c("FDR","padj.Bonferroni","FDR.BY")
res.transcript_level.2 <- lapply(res.transcript_level,function(df){
    res.genes_level.a.df <- as.data.frame(df)
    for(i in 1:3){
        res.genes_level.a.df[[names(m)[i]]] <- p.adjust(as.numeric(res.genes_level.a.df$pval),method = m[i])
    }
    res.genes_level.a.df$odds <- as.numeric(res.genes_level.a.df$odds)
    res.genes_level.a.df
})
head(res.transcript_level.2$alpha)
head(res.transcript_level.2$beta)

Unnamed: 0,pval,odds,type1_frac,type2_frac,FDR,padj.Bonferroni,FDR.BY
SAMD11_7,0.0008038729,0.5408936,0.01945617,0.03539157,0.002748127,1,0.029028453
SAMD11_13,0.102674,0.6931602,0.008907642,0.0128012,0.15438067,1,1.0
SAMD11_14,0.000387331,0.4795206,0.01429911,0.02936747,0.001484497,1,0.015680743
NOC2L_19,0.1934535,0.8726655,0.0403188,0.04593373,0.252822598,1,1.0
KLHL17_23,0.2498725,0.8278814,0.01312705,0.01581325,0.307508135,1,1.0
PLEKHN1_27,0.000104741,0.43971,0.01312705,0.02936747,0.000484751,1,0.005120424


Unnamed: 0,pval,odds,type1_frac,type2_frac,FDR,padj.Bonferroni,FDR.BY
SAMD11_7,0.4757446,1.0128525,0.0413413,0.04083807,0.491137,1.0,1.0
SAMD11_8,0.01671483,1.8056728,0.01148369,0.006392045,0.04365739,1.0,0.4613735
SAMD11_13,0.002476792,1.4091523,0.05029858,0.03622159,0.010318,1.0,0.1090411
SAMD11_14,9.622286e-08,1.4537397,0.1566376,0.1132812,3.153249e-06,0.002100064,3.332369e-05
NOC2L_17,1.421091e-10,1.9488789,0.07854846,0.04190341,1.308663e-08,3.10153e-06,1.383001e-07
NOC2L_19,0.3060026,0.9384924,0.03674782,0.0390625,0.3664676,1.0,1.0


In [46]:
res.transcript_level <- res.transcript_level.2
saveRDS(res.transcript_level,"../dat/1901/res.transcript_level.rds")