In [2]:
require(data.table)
require(tidyverse)

## 1. Read data 

In [12]:
# Load the long-format table (columns shown in the output below) and keep only
# the rows that carry a transcript index.
dat.mat <- filter(
    fread("../dat/1910_v2//long_matrix_w_transcripts.txt"),
    !is.na(tr.idx)
)
dim(dat.mat)
head(dat.mat)
# Optional: drop the cell id column if it is not needed.
# dat.mat$cell <- NULL

seq,start,end,cell,value,tr.idx,gene
chr10,100027284,100028604,Islet1fresh_AGACACCTAAGAGGCAGCGTAAGAAGGCG,2,9209,LOXL4
chr10,100027284,100028604,Islet1fresh_AGACACCTACTCGCTAGTAAGGAGCAGGA,2,9209,LOXL4
chr10,100027284,100028604,Islet1fresh_AGACACCTACTGAGCGATCTGAGTGTACT,2,9209,LOXL4
chr10,100027284,100028604,Islet1fresh_AGACACCTATCTCAGGTATCCTCTCCTAT,2,9209,LOXL4
chr10,100027284,100028604,Islet1fresh_AGACACCTATGCGCAGAAGGAGTAGTACT,1,9209,LOXL4
chr10,100027284,100028604,Islet1fresh_AGACACCTCGAGGCTGCTCTCTATGGTTG,2,9209,LOXL4


 Concepts: 
1. `promoter region`: the window from 500 bp upstream to 500 bp downstream (±500 bp) of every TSS in GENCODE
2. `promoter peaks`: peaks that overlap a promoter region

In [4]:
# Count the cells carrying each cluster label; these counts serve as the
# per-subtype totals in the contingency tables built later.
# NOTE(review): `$cluster` is kept as written; the label file's column names are
# not visible here, so confirm whether it resolves to `cluster` or `cluster_name`.
dat.all.cells <- table(
    fread("../dat/1910_v2/islet.cluster_labels.filt.txt")$cluster
)
dat.all.cells
sum(dat.all.cells)
# Number of distinct cells present in the peak matrix, for comparison.
uniqueN(dat.mat$cell)


     acinar     alpha_1     alpha_2      beta_1      beta_2     delta_1 
         46        5191        1027        4204        3394         478 
    delta_2      ductal endothelial       gamma      immune    stellate 
        232          80         118         260         140         128 

In [13]:
# Attach each cell's cluster label by matching `cell` against the label
# file's `index` column.
dat.mat <- left_join(
    dat.mat,
    select(
        fread("../dat/1910_v2/islet.cluster_labels.filt.txt"),
        index, cluster_name
    ),
    by = c(cell = "index")
)

## 2 Prepare data

In [14]:
# Collapse peaks: drop the peak coordinates and counts, de-duplicate while the
# cell id is still present, then drop the cell id itself.
dat.mat.transcript_level <- select(
    dat.mat,
    -one_of("seq", "start", "end", "value")
)
dat.mat.transcript_level <- distinct(dat.mat.transcript_level)
dat.mat.transcript_level <- select(dat.mat.transcript_level, -cell)

head(dat.mat.transcript_level)
dim(dat.mat.transcript_level)

tr.idx,gene,cluster_name
9209,LOXL4,alpha_1
9209,LOXL4,alpha_1
9209,LOXL4,beta_2
9209,LOXL4,alpha_1
9209,LOXL4,alpha_1
9209,LOXL4,alpha_1


In [15]:
# Report how many rows the de-duplication step removed.
cat(
    "Check how rows changed:\n",
    sprintf("Before applying uniquness, # of rows:%d\n", nrow(dat.mat)),
    sprintf("After applying uniquness, # of rows:%d\n", nrow(dat.mat.transcript_level)),
    sep = ""
)

Check how rows changed:
Before applying uniquness, # of rows:18758757
After applying uniquness, # of rows:18723382


### 2.1 Special cases: TSSs of two genes that lie too close together

- In these cases, the same peak may overlap the promoters of both genes.
- Note: the `foverlaps` results kept only the first match.

In [10]:
# Example of one transcript index (4016) whose rows carry two different genes.
# NOTE(review): this cell filters on `transcript.idx`, whereas the table built
# above exposes `tr.idx`; it appears to have been run against an earlier
# version of the data -- confirm the column name before re-running.
#fun.ftestPerGene(dat=dat.sub,tr = x)
x <- 4016
head(dat.mat.transcript_level%>% filter(transcript.idx==x))

# The two lines echoed below were copied by hand from the BED file using:
## sed -n 4015,4017p alpha.transcript_promoter_peaks.bed
cat("chr11	105947325	105948873	AASDHPPT\n
chr11	105947325	105948873	KBTBD3")

gene,cluster,transcript.idx
AASDHPPT,alpha_1,4016
KBTBD3,alpha_1,4016
AASDHPPT,alpha_1,4016
KBTBD3,alpha_1,4016
AASDHPPT,alpha_2,4016
KBTBD3,alpha_2,4016


chr11	105947325	105948873	AASDHPPT

chr11	105947325	105948873	KBTBD3

In [18]:
# Look for transcript indices that map to more than one gene within the
# alpha subtypes.
celltypes <- c("alpha_1", "alpha_2")
dat.sub <- filter(dat.mat.transcript_level, cluster_name %in% celltypes)

# Unique (tr.idx, gene) pairs; any tr.idx still duplicated has several genes.
dat.sub.red <- unique(group_by(select(dat.sub, -cluster_name), tr.idx))
setDT(dat.sub.red)
idx <- which(duplicated(dat.sub.red, by = "tr.idx"))
# Show the duplicated rows and the rows immediately preceding them.
head(dat.sub.red[idx, ])
head(dat.sub.red[idx - 1, ])

tr.idx,gene


tr.idx,gene


#####  In the above examples, each row is a peak 

#### 2.2.2 Handle these special cases by concatenating `gene` and `tr.idx`

In [19]:
# Merge `gene` and `tr.idx` into a single key column `gene_tr.idx`
# (joined with unite()'s default separator) and drop the two source columns.
dat.mat.transcript_level <- unite(
    dat.mat.transcript_level,
    "gene_tr.idx",
    c("gene", "tr.idx"),
    remove = TRUE
)
head(dat.mat.transcript_level)

gene_tr.idx,cluster_name
LOXL4_9209,alpha_1
LOXL4_9209,alpha_1
LOXL4_9209,beta_2
LOXL4_9209,alpha_1
LOXL4_9209,alpha_1
LOXL4_9209,alpha_1


## 3. Fisher's exact test at transcript level 

A gene's promoter is considered open in a cell as long as that cell has at least one promoter peak.

1. Get the total number of alpha 1 and alpha 2 cells.
2. Compare hits in alpha 1 vs. hits in alpha 2 (build a contingency table).
3. Perform [Fisher's exact test](https://en.wikipedia.org/wiki/Fisher%27s_exact_test) or a [chi-squared test](https://en.wikipedia.org/wiki/Chi-squared_test).

### 3.1 perform fisher's exact test for all transcripts

In [21]:
# Print the number of distinct transcript keys seen in each pair of subtypes.
cat(sprintf(
    "There are %d unique transcripts for alpha cells\n",
    uniqueN(filter(dat.mat.transcript_level,
                   cluster_name %in% c("alpha_1", "alpha_2"))$gene_tr.idx)
))
cat(sprintf(
    "There are %d unique transcripts for beta cells\n",
    uniqueN(filter(dat.mat.transcript_level,
                   cluster_name %in% c("beta_1", "beta_2"))$gene_tr.idx)
))
cat(sprintf(
    "There are %d unique transcripts for delta cells\n",
    uniqueN(filter(dat.mat.transcript_level,
                   cluster_name %in% c("delta_1", "delta_2"))$gene_tr.idx)
))

There are 18106 unique transcripts for alpha cells
There are 18116 unique transcripts for beta cells
There are 17257 unique transcripts for delta cells


In [23]:
# Worked example of the per-transcript test for a single transcript key in the
# two alpha subtypes.
tr <- "LOXL4_9209"
celltypes <- c("alpha_1", "alpha_2")
dat.sub <- dat.mat.transcript_level %>%
    filter(cluster_name %in% celltypes)

# Count the rows of this transcript in each subtype. The column is referenced
# by its full name: `$cluster` only worked through partial matching.
test.dat <- table((dat.sub %>% filter(gene_tr.idx == tr))$cluster_name)
table.res <- as.vector(test.dat)
names(table.res) <- names(test.dat)
# A subtype with no rows is absent from the table; give it an explicit 0.
a <- setdiff(celltypes, names(table.res))
table.res[a] <- 0

# 2 x 2 table: row "Yes" = rows counted per subtype, row "No" = remaining cells
# of that subtype according to `dat.all.cells`.
test.tab <- matrix(c(table.res[celltypes[1]], table.res[celltypes[2]],
                     dat.all.cells[celltypes[1]] - table.res[celltypes[1]],
                     dat.all.cells[celltypes[2]] - table.res[celltypes[2]]),
                   byrow = TRUE,
                   nrow = 2,
                   dimnames = list(expressed = c("Yes", "No"),
                                   subtype = celltypes))
test.tab
test.tab[1]
test.tab[2]
# Matrices are indexed column-major, so [1], [2] are the first subtype's
# Yes/No counts and [3], [4] the second subtype's.
f1 <- test.tab[1] / (test.tab[1] + test.tab[2])
f2 <- test.tab[3] / (test.tab[3] + test.tab[4])
# One-sided alternative chosen in the direction of the observed difference.
l <- if (f1 > f2) "greater" else "less"
test.res <- fisher.test(test.tab, alternative = l)
res <- list(pval = test.res$p.value,
            odds = test.res$estimate,
            type1_frac = f1,
            type2_frac = f2)
res

# Two-sided test on the same table, for reference.
fisher.test(test.tab)

Unnamed: 0,alpha_1,alpha_2
Yes,391,60
No,4800,967



	Fisher's Exact Test for Count Data

data:  test.tab
p-value = 0.05634
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.988697 1.768290
sample estimates:
odds ratio 
  1.312778 


In [26]:
# One-sided Fisher's exact test comparing how often a transcript key occurs in
# two cell subtypes.
#
# Arguments:
#   tr        Transcript key to test; matched against `dat$gene_tr.idx`.
#   celltypes Character vector of exactly two subtype labels (first = type 1).
#   dat       Table with columns `gene_tr.idx` and `cluster_name`; rows of `tr`
#             are counted per subtype. Defaults to `dat.mat.transcript_level`
#             restricted to `celltypes`.
#   all.cells Named counts of total cells per subtype (a `table` or a named
#             vector). Defaults to the global `dat.all.cells`.
#
# Returns a list with
#   pval        p-value of the one-sided test,
#   odds        odds-ratio estimate reported by fisher.test(),
#   type1_frac  rows of `tr` in celltypes[1] / total cells of celltypes[1],
#   type2_frac  the same for celltypes[2].
#
# The alternative is "greater" when type1_frac > type2_frac, otherwise "less".
# Stops with an error when `celltypes` does not have length 2 or when a
# subtype has no entry in `all.cells`.
fun.ftestPerTr <- function(  tr='SAMD11_7',
                             celltypes=c('alpha_1','alpha_2'),
                             dat=dat.mat.transcript_level%>%
                                  select(one_of("gene_tr.idx","cluster_name"))%>%
                                  filter(cluster_name %in%celltypes),
                             all.cells=dat.all.cells){
    stopifnot(length(celltypes) == 2L)

    # Subtype label of every row of `dat` that belongs to this transcript.
    # Uses the `dat` argument (not a global) and exact column names.
    hit.clusters <- dat[["cluster_name"]][which(dat[["gene_tr.idx"]] == tr)]

    # Rows per requested subtype; a subtype with no rows gets 0.
    hits <- vapply(celltypes,
                   function(ct) sum(hit.clusters == ct, na.rm = TRUE),
                   integer(1))

    totals <- as.vector(all.cells[celltypes])
    if (anyNA(totals)) {
        stop("every entry of 'celltypes' must have a total in 'all.cells'",
             call. = FALSE)
    }

    # Row "Yes" = counted rows, row "No" = remaining cells of each subtype.
    test.tab <- matrix(c(hits, totals - hits),
                       byrow = TRUE,
                       nrow = 2,
                       dimnames = list(expressed = c("Yes", "No"),
                                       subtype = celltypes))

    f1 <- hits[[1]] / totals[[1]]
    f2 <- hits[[2]] / totals[[2]]
    # Test in the direction of the observed difference.
    l <- if (f1 > f2) "greater" else "less"
    test.res <- fisher.test(test.tab, alternative = l)
    list(pval = test.res$p.value,
         odds = test.res$estimate,
         type1_frac = f1,
         type2_frac = f2)
}

##

# Smoke-test and time fun.ftestPerTr() on the alpha subtypes, once with its
# default `dat` and once with the pre-filtered table passed explicitly.
celltypes <- c("alpha_1", "alpha_2")
dat.sub <- filter(
    select(dat.mat.transcript_level, one_of("gene_tr.idx", "cluster_name")),
    cluster_name %in% celltypes
)
# fun.ftestPerGene(dat = dat.sub, tr = 1)
system.time(fun.ftestPerTr(celltypes = celltypes))
fun.ftestPerTr(celltypes = celltypes)
system.time(fun.ftestPerTr(dat = dat.sub, celltypes = celltypes))
fun.ftestPerTr(dat = dat.sub, celltypes = celltypes)
# Earlier serial loop, kept for reference:
# all.tr <- unique(dat.sub$transcript.idx)
# for (x in all.tr) {
#     fun.ftestPerGene(dat = dat.sub, tr = x)
# }



#fun.ftestPerGene(dat = dat.sub,tr=1)

   user  system elapsed 
  0.074   0.008   0.082 

   user  system elapsed 
  0.054   0.002   0.057 

In [27]:
# Same smoke test for the beta subtypes.
celltypes <- c("beta_1", "beta_2")
dat.sub <- filter(
    select(dat.mat.transcript_level, one_of("gene_tr.idx", "cluster_name")),
    cluster_name %in% celltypes
)
fun.ftestPerTr(celltypes = celltypes)
fun.ftestPerTr(dat = dat.sub, celltypes = celltypes)


In [28]:
# Subtype pairs to compare, keyed by cell type.
celltypes <- list(
    alpha = c("alpha_1", "alpha_2"),
    beta = c("beta_1", "beta_2"),
    delta = c("delta_1", "delta_2")
)
res.transcript_level <- list()
# Time-consuming: one Fisher test per transcript key and cell type, spread
# over 8 forked workers. Results are stored in the order of `all.tr`.
system.time(for (x in c("alpha", "beta", "delta")) {
    require(parallel)
    dat.sub <- filter(
        select(dat.mat.transcript_level, one_of("gene_tr.idx", "cluster_name")),
        cluster_name %in% celltypes[[x]]
    )
    all.tr <- unique(dat.sub$gene_tr.idx)
    res.transcript_level[[x]] <- mclapply(
        all.tr,
        function(trr) {
            fun.ftestPerTr(dat = dat.sub, tr = trr, celltypes = celltypes[[x]])
        },
        mc.cores = 8
    )
})

Loading required package: parallel


    user   system  elapsed 
4885.699 1733.654 1297.098 

In [30]:
# alpha: stack the per-transcript result lists into one table.
# The transcript keys are recomputed with the same expression used when the
# tests were run, so the row names line up with the results by position.
x <- celltypes$alpha
dat.sub <- filter(
    select(dat.mat.transcript_level, one_of("gene_tr.idx", "cluster_name")),
    cluster_name %in% x
)
all.tr <- unique(dat.sub$gene_tr.idx)
res.genes_level.a.df <- do.call(rbind, res.transcript_level[["alpha"]])
rownames(res.genes_level.a.df) <- all.tr
head(res.genes_level.a.df)
res.transcript_level[["alpha"]] <- res.genes_level.a.df

Unnamed: 0,pval,odds,type1_frac,type2_frac
LOXL4_9209,0.03035653,1.312778,0.07532267,0.05842259
PYROXD2_9210,8.217327e-05,2.534973,0.03621653,0.01460565
HPS1_9211,0.1883507,1.667152,0.008090927,0.004868549
HPS1_9212,3.985771e-06,0.2241556,0.004430746,0.0194742
HPS1_9213,0.005588597,0.6754621,0.04295897,0.06231743
CNNM1_9220,0.0008534694,0.7174467,0.1017145,0.1363194


In [31]:
# beta: stack the per-transcript result lists into one table.
# The transcript keys are recomputed with the same expression used when the
# tests were run, so the row names line up with the results by position.
x <- celltypes$beta
dat.sub <- filter(
    select(dat.mat.transcript_level, one_of("gene_tr.idx", "cluster_name")),
    cluster_name %in% x
)
all.tr <- unique(dat.sub$gene_tr.idx)
res.genes_level.a.df <- do.call(rbind, res.transcript_level[["beta"]])
rownames(res.genes_level.a.df) <- all.tr
head(res.genes_level.a.df)
res.transcript_level[["beta"]] <- res.genes_level.a.df

Unnamed: 0,pval,odds,type1_frac,type2_frac
LOXL4_9209,0.006368447,1.516484,0.02568982,0.01708898
PYROXD2_9210,0.2433923,0.6452322,0.00190295,0.002946376
HPS1_9211,0.1278564,0.2016509,0.0002378687,0.00117855
HPS1_9212,0.4846571,1.345902,0.001189343,0.0008839128
HPS1_9213,0.0354369,1.279869,0.03591817,0.02828521
CNNM1_9220,0.4681654,1.009449,0.09657469,0.09575722


In [33]:
# delta: stack the per-transcript result lists into one table.
# The transcript keys are recomputed with the same expression used when the
# tests were run, so the row names line up with the results by position.
x <- celltypes$delta
dat.sub <- filter(
    select(dat.mat.transcript_level, one_of("gene_tr.idx", "cluster_name")),
    cluster_name %in% x
)
all.tr <- unique(dat.sub$gene_tr.idx)
res.genes_level.a.df <- do.call(rbind, res.transcript_level[["delta"]])
rownames(res.genes_level.a.df) <- all.tr
head(res.genes_level.a.df)
res.transcript_level[["delta"]] <- res.genes_level.a.df

Unnamed: 0,pval,odds,type1_frac,type2_frac
LOXL4_9209,0.5679042,1.052973,0.02719665,0.02586207
PYROXD2_9210,0.2502162,0.2416201,0.00209205,0.00862069
HPS1_9211,0.3267606,0.0,0.0,0.004310345
HPS1_9212,0.3267606,0.0,0.0,0.004310345
HPS1_9213,0.038763,2.746818,0.0460251,0.01724138
CNNM1_9220,0.2975085,0.843879,0.09623431,0.112069


In [34]:
# Multiple-testing correction: output column name -> p.adjust() method.
m <- c(FDR = "fdr", padj.Bonferroni = "bonferroni", FDR.BY = "BY")
res.transcript_level.2 <- lapply(res.transcript_level, function(df) {
    adj.df <- as.data.frame(df)
    # Convert the p-value column to a plain numeric vector once.
    raw.p <- as.numeric(adj.df$pval)
    for (col in names(m)) {
        adj.df[[col]] <- p.adjust(raw.p, method = m[[col]])
    }
    adj.df$odds <- as.numeric(adj.df$odds)
    adj.df
})
head(res.transcript_level.2$alpha, 1)
head(res.transcript_level.2$beta, 1)
head(res.transcript_level.2$delta, 1)

Unnamed: 0,pval,odds,type1_frac,type2_frac,FDR,padj.Bonferroni,FDR.BY
LOXL4_9209,0.03035653,1.312778,0.07532267,0.05842259,0.07060184,1,0.7329348


Unnamed: 0,pval,odds,type1_frac,type2_frac,FDR,padj.Bonferroni,FDR.BY
LOXL4_9209,0.006368447,1.516484,0.02568982,0.01708898,0.04274813,1,0.4438023


Unnamed: 0,pval,odds,type1_frac,type2_frac,FDR,padj.Bonferroni,FDR.BY
LOXL4_9209,0.5679042,1.052973,0.02719665,0.02586207,0.6096245,1,1


In [35]:
# Keep the adjusted tables under the original name and persist them to disk.
res.transcript_level <- res.transcript_level.2
saveRDS(
    object = res.transcript_level,
    file = "../dat/1910_v2/prom.ttest.transcript_level.rds"
)

### adding back peak info

In [36]:
# Short alias for the adjusted per-transcript results, used by the export cells below.
res <- res.transcript_level

In [37]:
# Show the first alpha row and the table's dimensions.
head(res$alpha,1)
dim(res$alpha)

Unnamed: 0,pval,odds,type1_frac,type2_frac,FDR,padj.Bonferroni,FDR.BY
LOXL4_9209,0.03035653,1.312778,0.07532267,0.05842259,0.07060184,1,0.7329348


In [39]:
# Rebuild `dat.mat.transcript_level` as a peak -> transcript-key lookup.
# Note that this overwrites the per-cell table of the same name built earlier.
dat.mat.transcript_level <- distinct(
    select(dat.mat, -one_of("cell", "cluster_name", "value"))
)
# Transcript key in the form gene_tr.idx.
dat.mat.transcript_level <- unite(
    dat.mat.transcript_level, "V1", c("gene", "tr.idx"), sep = "_"
)
# Peak label in the form seq:start-end.
dat.mat.transcript_level <- unite(
    dat.mat.transcript_level, "tmp", c("start", "end"), sep = "-"
)
dat.mat.transcript_level <- unite(
    dat.mat.transcript_level, "peak", c("seq", "tmp"), sep = ":"
)

head(dat.mat.transcript_level)
dim(dat.mat.transcript_level)

peak,V1
chr10:100027284-100028604,LOXL4_9209
chr10:100174567-100175450,PYROXD2_9210
chr10:100190809-100191248,HPS1_9211
chr10:100194089-100194460,HPS1_9212
chr10:100205796-100206886,HPS1_9213
chr10:101088824-101090586,CNNM1_9220


In [41]:
# alpha: attach peak labels to the test results, express the open fractions
# as percentages, and write the volcano-plot input.
res.2 <- right_join(
    rename(dat.mat.transcript_level, gene_transcript = V1),
    rownames_to_column(res$alpha, "gene_transcript")
)
res.2 <- mutate(
    res.2,
    percent_open_state1 = as.numeric(type1_frac) * 100,
    percent_open_state2 = as.numeric(type2_frac) * 100
)
res.2 <- select(res.2, -ends_with("_frac"))
dim(res.2)
head(res.2)
fwrite(res.2, "../figures/Fig2/subfigs/fig_2.prom_alpha_volcano.csv")


Joining, by = "gene_transcript"


peak,gene_transcript,pval,odds,FDR,padj.Bonferroni,FDR.BY,percent_open_state1,percent_open_state2
chr10:100027284-100028604,LOXL4_9209,0.03035653,1.3127775,0.07060184,1.0,0.7329347674,7.5322674,5.842259
chr10:100174567-100175450,PYROXD2_9210,8.217327e-05,2.5349729,0.0005965634,1.0,0.0061930693,3.6216529,1.4605648
chr10:100190809-100191248,HPS1_9211,0.1883507,1.6671522,0.2774614,1.0,1.0,0.8090927,0.4868549
chr10:100194089-100194460,HPS1_9212,3.985771e-06,0.2241556,4.983866e-05,0.07216638,0.0005173872,0.4430746,1.9474197
chr10:100205796-100206886,HPS1_9213,0.005588597,0.6754621,0.01836427,1.0,0.1906439455,4.2958967,6.2317429
chr10:101088824-101090586,CNNM1_9220,0.0008534694,0.7174467,0.004105451,1.0,0.0426196777,10.1714506,13.6319377


In [42]:
# beta: attach peak labels to the test results, express the open fractions
# as percentages, and write the volcano-plot input.
res.2 <- right_join(
    rename(dat.mat.transcript_level, gene_transcript = V1),
    rownames_to_column(res$beta, "gene_transcript")
)
res.2 <- mutate(
    res.2,
    percent_open_state1 = as.numeric(type1_frac) * 100,
    percent_open_state2 = as.numeric(type2_frac) * 100
)
res.2 <- select(res.2, -ends_with("_frac"))
dim(res.2)
head(res.2)
fwrite(res.2, "../figures/Fig2/subfigs/fig_2.prom_beta_volcano.csv")


Joining, by = "gene_transcript"


peak,gene_transcript,pval,odds,FDR,padj.Bonferroni,FDR.BY,percent_open_state1,percent_open_state2
chr10:100027284-100028604,LOXL4_9209,0.006368447,1.5164838,0.04274813,1,0.4438023,2.56898192,1.70889806
chr10:100174567-100175450,PYROXD2_9210,0.2433923,0.6452322,0.38082702,1,1.0,0.19029496,0.2946376
chr10:100190809-100191248,HPS1_9211,0.1278564,0.2016509,0.27068726,1,1.0,0.02378687,0.11785504
chr10:100194089-100194460,HPS1_9212,0.4846571,1.3459021,0.52094924,1,1.0,0.11893435,0.08839128
chr10:100205796-100206886,HPS1_9213,0.0354369,1.2798685,0.12826052,1,1.0,3.59181732,2.82852092
chr10:101088824-101090586,CNNM1_9220,0.4681654,1.0094488,0.51242459,1,1.0,9.65746908,9.57572186


In [43]:
# delta: attach peak labels to the test results, express the open fractions
# as percentages, and write the volcano-plot input.
res.2 <- right_join(
    rename(dat.mat.transcript_level, gene_transcript = V1),
    rownames_to_column(res$delta, "gene_transcript")
)
res.2 <- mutate(
    res.2,
    percent_open_state1 = as.numeric(type1_frac) * 100,
    percent_open_state2 = as.numeric(type2_frac) * 100
)
res.2 <- select(res.2, -ends_with("_frac"))
dim(res.2)
head(res.2)
fwrite(res.2, "../figures/Fig2/subfigs/fig_2.prom_delta_volcano.csv")


Joining, by = "gene_transcript"


peak,gene_transcript,pval,odds,FDR,padj.Bonferroni,FDR.BY,percent_open_state1,percent_open_state2
chr10:100027284-100028604,LOXL4_9209,0.5679042,1.0529732,0.6096245,1,1,2.719665,2.5862069
chr10:100174567-100175450,PYROXD2_9210,0.2502162,0.2416201,0.4410153,1,1,0.209205,0.862069
chr10:100190809-100191248,HPS1_9211,0.3267606,0.0,0.4919224,1,1,0.0,0.4310345
chr10:100194089-100194460,HPS1_9212,0.3267606,0.0,0.4919224,1,1,0.0,0.4310345
chr10:100205796-100206886,HPS1_9213,0.038763,2.7468176,0.1802724,1,1,4.60251,1.7241379
chr10:101088824-101090586,CNNM1_9220,0.2975085,0.843879,0.4854331,1,1,9.623431,11.2068966
