In [2]:
require(data.table)
require(tidyverse)

## 1. Read data 

In [3]:
# Read the long-format promoter table from disk.
dat.mat <- fread('../dat/1901/delta.promoter.long_matrix_w_transcripts_corrected.txt')

# Show its dimensions and its first rows.
dat.mat %>% dim()
dat.mat %>% head()
# dat.mat$cell <- NULL  # the cell id column is not needed

seq,start,end,gene,cluster,barcode,transcript.idx
chr1,859052,860562,SAMD11,delta_1,Islet3-fresh_CGGCTATGGCTCATGACTAAGCCTGTACTGAC,7
chr1,859052,860562,SAMD11,delta_1,Islet3-fresh_AGACACCTACTCGCTATCTCTCCGGTACTGAC,7
chr1,859052,860562,SAMD11,delta_1,Islet3-fresh_AGACACCTACTCGCTATCTCTCCGGTACTGAC,7
chr1,859052,860562,SAMD11,delta_1,Islet2-fresh_TCGAGGCACGTACTAGTCTAGCTAGGC,7
chr1,859052,860562,SAMD11,delta_1,Islet3-fresh_AGCGATAGTAGGCATGTTATGCGATATAGCCT,7
chr1,859052,860562,SAMD11,delta_1,Islet3-fresh_CGGCTATGGCTCATGACTAAGCCTGTACTGAC,7


 Concepts: 
1. `promoter region`: the window from 500 bp upstream to 500 bp downstream (-500 bp to +500 bp) of every TSS in GENCODE
2. `promoter peaks`: peaks that overlap a promoter region

## 2 Prepare data

In [4]:
# Prepare data: drop the peak coordinates, keep one row per distinct
# (gene, cluster, barcode, transcript.idx) combination, then drop the barcode.
dat.mat.transcript_level <- distinct(
    select(dat.mat, -one_of("seq", "start", "end"))
)
dat.mat.transcript_level <- select(dat.mat.transcript_level, -barcode)

# (optional step, applied in a later cell)
#    unite("gene_tr.idx",c("gene","transcript.idx"),remove = T)
head(dat.mat.transcript_level)
dim(dat.mat.transcript_level)

gene,cluster,transcript.idx
SAMD11,delta_1,7
SAMD11,delta_1,7
SAMD11,delta_1,7
SAMD11,delta_1,7
SAMD11,delta_1,7
SAMD11,delta_1,7


In [6]:
# Print the number of rows before and after the de-duplication step.
cat(
    "Check how rows changed:\n",
    sprintf("Before applying uniquness, # of rows:%d\n", nrow(dat.mat)),
    sprintf("After applying uniquness, # of rows:%d\n", nrow(dat.mat.transcript_level)),
    sep = ""
)

Check how rows changed:
Before applying uniquness, # of rows:2330095
After applying uniquness, # of rows:1009259


### 2.1 Special cases: TSS too close to two genes

- In these cases, the same peak may overlap the promoters of two genes
- Note: the `foverlaps` results keep only the first match. 

In [5]:
# Restrict the transcript-level table to the two delta sub-clusters.
celltypes <- c('delta_1', 'delta_2')
dat.sub <- filter(dat.mat.transcript_level, cluster %in% celltypes)

# Unique (gene, transcript.idx) pairs, converted to a data.table in place.
dat.sub.red <- dat.sub %>%
    select(-cluster) %>%
    group_by(transcript.idx) %>%
    unique()
setDT(dat.sub.red)

# Rows whose transcript.idx already appeared on an earlier row, i.e. a second
# gene sharing the same transcript index.
idx <- which(duplicated(dat.sub.red, by = "transcript.idx"))
head(dat.sub.red[idx, ])
# The row immediately before each of those rows.
# NOTE(review): this assumes the first gene of each pair sits directly above
# the duplicate -- confirm the row order guarantees that.
head(dat.sub.red[idx - 1, ])

gene,transcript.idx
B3GALT6,84
PUSL1,131
GLTPD1,157
RP4-758J18.2,223
AL645728.1,273
RER1,452


gene,transcript.idx
SDF4,84
ACAP3,131
CPSF3L,157
CCNL2,223
SSU72,273
MORN1,452


In [14]:
# Distinct peak/gene/cluster rows recorded for transcript index 84.
dat.mat %>%
    select(-barcode) %>%
    filter(transcript.idx == 84) %>%
    distinct()

seq,start,end,gene,cluster,transcript.idx
chr1,1166826,1167987,SDF4,delta_2,84
chr1,1166826,1167987,SDF4,delta_1,84
chr1,1166826,1167987,B3GALT6,delta_2,84
chr1,1166826,1167987,B3GALT6,delta_1,84


#####  In the above examples, each row is a peak 

#### 2.2.2 Handle these special cases by concatenating gene and tr.idx

In [6]:
# Merge `gene` and `transcript.idx` into a single "gene_tr.idx" key column
# (joined with unite()'s default separator); the two source columns are dropped.
dat.mat.transcript_level <- unite(
    dat.mat.transcript_level,
    col = "gene_tr.idx",
    c("gene", "transcript.idx"),
    remove = TRUE
)
head(dat.mat.transcript_level)

gene_tr.idx,cluster
SAMD11_7,delta_1
SAMD11_7,delta_1
SAMD11_7,delta_1
SAMD11_7,delta_1
SAMD11_7,delta_1
SAMD11_7,delta_1


## 3. Fisher's exact test at transcript level 

A gene's promoter is considered open in a cell as long as that cell has at least one promoter peak for it. 

1. get the total number of cells in each of the two subtypes (here delta_1 and delta_2)
2. test hits in subtype 1 vs. hits in subtype 2 (create a contingency table)
3. perform [Fisher's exact test](https://en.wikipedia.org/wiki/Fisher%27s_exact_test) or [chi-squared test](https://en.wikipedia.org/wiki/Chi-squared_test)

### 3.1 perform fisher's exact test for all transcripts

In [7]:
# Report how many distinct gene/transcript ids and how many distinct cells
# belong to the delta clusters. Both counts apply the same cluster filter, so
# the "delta cells" figure stays correct even if `dat.mat` contains other
# clusters. local() keeps the helper variable out of the global workspace.
local({
    delta.clusters <- c("delta_1", "delta_2")

    delta.tr <- dat.mat.transcript_level %>% filter(cluster %in% delta.clusters)
    cat(sprintf("There are %d unique transcripts for delta cells\n",
                length(unique(delta.tr$gene_tr.idx))))

    delta.cells <- dat.mat %>% filter(cluster %in% delta.clusters)
    cat(sprintf("There are %d delta cells\n",
                length(unique(delta.cells$barcode))))
})


There are 18547 unique transcripts for delta cells
There are 723 delta cells


In [8]:
# Number of cells per cluster: de-duplicate the (cluster, barcode) pairs so
# each cell is counted once, then tabulate the cluster column.
dat.all.cells <- table(unique(dat.mat[, c("cluster", "barcode")])$cluster)
dat.all.cells


delta_1 delta_2 
    688      35 

In [9]:
# Worked example of the per-transcript test for a single transcript.
tr <- 'SAMD11_7'
celltypes <- c('delta_1', 'delta_2')
dat.sub <- filter(dat.mat.transcript_level, cluster %in% celltypes)

# Rows per cluster for this transcript.
test.dat <- table(filter(dat.sub, gene_tr.idx == tr)$cluster)
table.res <- setNames(as.vector(test.dat), names(test.dat))
# A subtype with no rows is absent from the table: give it an explicit 0.
a <- setdiff(celltypes, names(table.res))
table.res[a] <- 0

# 2 x 2 table: first row = counts with a hit, second row = remaining cells;
# columns follow the order of `celltypes`.
test.tab <- matrix(
    c(table.res[celltypes],
      dat.all.cells[celltypes] - table.res[celltypes]),
    byrow = TRUE,
    nrow = 2,
    dimnames = list(expressed = c("Yes", "No"),
                    subtype = celltypes)
)
test.tab
test.tab[1]
test.tab[2]

# Fraction of cells with a hit in each subtype (column-wise "Yes" / total).
f1 <- test.tab[1, 1] / sum(test.tab[, 1])
f2 <- test.tab[1, 2] / sum(test.tab[, 2])
# One-sided alternative pointing in the direction of the observed difference.
l <- ifelse(f1 > f2, 'greater', 'less')
test.res <- fisher.test(test.tab, alternative = l)
res <- list(
    pval = test.res$p.value,
    odds = test.res$estimate,
    type1_frac = f1,
    type2_frac = f2
)
res

# Two-sided test on the same table, for comparison.
fisher.test(test.tab)

Unnamed: 0,delta_1,delta_2
Yes,12,2
No,676,33



	Fisher's Exact Test for Count Data

data:  test.tab
p-value = 0.1439
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.06143683 2.80886784
sample estimates:
odds ratio 
 0.2937532 


In [11]:
# Fisher's exact test for one transcript between two cell subtypes.
#
# Arguments:
#   tr         Value of `gene_tr.idx` identifying the transcript to test.
#   celltypes  Character vector of exactly two cluster labels; the first is
#              "type 1", the second "type 2".
#   dat        Table with columns `gene_tr.idx` and `cluster`. Each row is
#              counted as one cell with a hit for that transcript (expected to
#              hold one row per cell and transcript). Defaults to the global
#              `dat.mat.transcript_level` restricted to `celltypes`.
#   all.cells  Named counts (table or named vector) giving the total number of
#              cells per cluster. Defaults to the global `dat.all.cells`.
#
# Returns a list with:
#   pval        p-value of the one-sided Fisher's exact test
#   odds        estimated odds ratio (as returned by fisher.test)
#   type1_frac  fraction of type-1 cells with a hit
#   type2_frac  fraction of type-2 cells with a hit
#
# NOTE(review): the one-sided alternative is picked from the observed
# fractions, i.e. after looking at the data. Kept for compatibility with
# existing results -- confirm this is the intended procedure.
fun.ftestPerTr <- function(tr = 'SAMD11_7',
                           celltypes = c('alpha_1', 'alpha_2'),
                           dat = dat.mat.transcript_level %>%
                               select(one_of("gene_tr.idx", "cluster")) %>%
                               filter(cluster %in% celltypes),
                           all.cells = dat.all.cells) {
    stopifnot(
        length(celltypes) == 2L,
        all(celltypes %in% names(all.cells))
    )

    # Count hits per subtype from the `dat` argument. Building a factor with
    # fixed levels yields an explicit 0 for a subtype without any hit.
    hit.cluster <- dat$cluster[which(dat$gene_tr.idx == tr)]
    n.open <- as.numeric(table(factor(hit.cluster, levels = celltypes)))
    n.total <- as.numeric(all.cells[celltypes])

    # 2 x 2 table: row "Yes" = cells with a hit, row "No" = remaining cells.
    test.tab <- matrix(c(n.open, n.total - n.open),
                       byrow = TRUE,
                       nrow = 2,
                       dimnames = list(expressed = c("Yes", "No"),
                                       subtype = celltypes))

    f1 <- n.open[1] / n.total[1]
    f2 <- n.open[2] / n.total[2]
    l <- if (f1 > f2) 'greater' else 'less'
    test.res <- fisher.test(test.tab, alternative = l)

    list(pval = test.res$p.value,
         odds = test.res$estimate,
         type1_frac = f1,
         type2_frac = f2)
}

##

# Build the delta subset holding only the two columns the test needs.
celltypes <- c('delta_1', 'delta_2')
dat.sub <- filter(
    select(dat.mat.transcript_level, one_of("gene_tr.idx", "cluster")),
    cluster %in% celltypes
)
#fun.ftestPerGene(dat = dat.sub,tr=1)

# Time and show one call relying on the default `dat` argument ...
system.time(fun.ftestPerTr(celltypes = celltypes))
fun.ftestPerTr(celltypes = celltypes)
# ... and one call with the pre-filtered subset passed explicitly.
system.time(fun.ftestPerTr(dat = dat.sub, celltypes = celltypes))
fun.ftestPerTr(dat = dat.sub, celltypes = celltypes)

# Earlier sequential loop, kept for reference:
#all.tr <- unique(dat.sub$transcript.idx)
#for(x in all.tr){
#    fun.ftestPerGene(dat=dat.sub,tr = x)
#}



#fun.ftestPerGene(dat = dat.sub,tr=1)

   user  system elapsed 
  0.027   0.001   0.027 

   user  system elapsed 
  0.009   0.000   0.010 

In [12]:
# Cluster labels to compare, keyed by cell type.
celltypes <- list()
celltypes$delta <- c('delta_1', 'delta_2')

res.transcript_level <- list()

# Attach `parallel` once and fail immediately if it is unavailable
# (require() inside the loop only returned FALSE, which was ignored).
library(parallel)

# Time-consuming task: one Fisher's exact test per transcript, run in parallel.
# The worker count defaults to 8 and can be overridden with options(mc.cores = ).
system.time(for (x in c("delta")) {
    # Keep the name `dat.sub`: it is assigned in the global workspace and is
    # also what the per-transcript test is handed as `dat`.
    dat.sub <- dat.mat.transcript_level %>%
        select(one_of("gene_tr.idx", "cluster")) %>%
        filter(cluster %in% celltypes[[x]])
    all.tr <- unique(dat.sub$gene_tr.idx)

    res.transcript_level[[x]] <- mclapply(
        all.tr,
        function(trr) fun.ftestPerTr(dat = dat.sub,
                                     tr = trr,
                                     celltypes = celltypes[[x]]),
        mc.cores = getOption("mc.cores", 8L)
    )

    # mclapply() stores a "try-error" object for a failed element instead of
    # raising; stop here rather than binding such objects into the results.
    if (any(vapply(res.transcript_level[[x]], inherits, logical(1), what = "try-error"))) {
        stop("fun.ftestPerTr failed for at least one transcript of '", x, "'",
             call. = FALSE)
    }

    # Label every result with its transcript id.
    names(res.transcript_level[[x]]) <- all.tr
})
        

   user  system elapsed 
209.493  25.998  34.092 

In [13]:
# delta
x <- celltypes$delta
dat.sub <- dat.mat.transcript_level%>%select(one_of("gene_tr.idx","cluster"))%>%filter(cluster %in% x)
all.tr <- unique(dat.sub$gene_tr.idx)
res.genes_level.a.df <- do.call(rbind,res.transcript_level$delta)
rownames(res.genes_level.a.df) <- all.tr
head(res.genes_level.a.df)
res.transcript_level$delta <-res.genes_level.a.df 

Unnamed: 0,pval,odds,type1_frac,type2_frac
SAMD11_7,0.1439338,0.2937532,0.01744186,0.05714286
SAMD11_8,0.7417201,inf,0.00872093,0.0
SAMD11_13,0.6343316,0.9656582,0.02761628,0.02857143
SAMD11_14,0.304654,0.6067625,0.05377907,0.08571429
NOC2L_19,0.2425178,inf,0.04069767,0.0
PLEKHN1_27,0.3289739,0.3503028,0.01017442,0.02857143


In [14]:
# adjust for p value 
m<- c("fdr","bonferroni","BY");names(m)<-c("FDR","padj.Bonferroni","FDR.BY")
res.transcript_level.2 <- lapply(res.transcript_level,function(df){
    res.genes_level.a.df <- as.data.frame(df)
    for(i in 1:3){
        res.genes_level.a.df[[names(m)[i]]] <- p.adjust(as.numeric(res.genes_level.a.df$pval),method = m[i])
    }
    res.genes_level.a.df$odds <- as.numeric(res.genes_level.a.df$odds)
    res.genes_level.a.df
})
head(res.transcript_level.2$delta)


Unnamed: 0,pval,odds,type1_frac,type2_frac,FDR,padj.Bonferroni,FDR.BY
SAMD11_7,0.1439338,0.2937532,0.01744186,0.05714286,0.7106693,1,1
SAMD11_8,0.7417201,inf,0.00872093,0.0,0.7682294,1,1
SAMD11_13,0.6343316,0.9656582,0.02761628,0.02857143,0.7140624,1,1
SAMD11_14,0.304654,0.6067625,0.05377907,0.08571429,0.7106693,1,1
NOC2L_19,0.2425178,inf,0.04069767,0.0,0.7106693,1,1
PLEKHN1_27,0.3289739,0.3503028,0.01017442,0.02857143,0.7106693,1,1


In [15]:
# Save the delta results as CSV, keeping the transcript ids (row names).
fwrite(
    res.transcript_level.2$delta,
    "../dat/1901/res.transcript_level_delta.csv",
    row.names = TRUE
)