In [2]:
require(data.table)
require(tidyverse)

## 1. Rationale

* because the data are sparse, ps is binned into 100 bins (the number of bins can be changed)
* a chi-squared test then checks whether the ratio of opening depends on the bin

##  2. load data

In [28]:
# Read the long-format promoter table and split its `cluster` column into
# `celltype` and `subtype` (separate() is called with its default separator).
dat.pro_ps <- separate(
    fread("../dat/1901/alpha_beta.promoter.long_matrix_w_transcripts_ps.txt"),
    cluster,
    into = c("celltype", "subtype")
)
# Inspect both ends of the table and its dimensions.
head(dat.pro_ps)
tail(dat.pro_ps)
dim(dat.pro_ps)

gene_tr.idx,celltype,subtype,cell,ps
SAMD11_7,alpha,1,Islet1-fresh_AGACACCTAGGCAGAAGTAAGGAGCAGGA,5.643735
SAMD11_7,alpha,1,Islet1-fresh_AGACACCTATGCGCAGCGTCTAATGGTTG,1.056578
SAMD11_7,alpha,2,Islet1-fresh_AGACACCTCGAGGCTGAAGGCTATGGTTG,6.254863
SAMD11_7,alpha,2,Islet1-fresh_AGACACCTCGTACTAGCTAAGCCTGTACT,6.508574
SAMD11_7,alpha,1,Islet1-fresh_AGACACCTGGACTCCTTCGACTAGGGTTG,2.12067
SAMD11_7,alpha,2,Islet1-fresh_AGACACCTGGAGCTACAAGGAGTAAGGCG,5.564368


gene_tr.idx,celltype,subtype,cell,ps
EIF1AY_133795,beta,1,Islet3-fresh_TCCGGAGAGGAGCTACTCGACTAGGTACTGAC,3.157942
EIF1AY_133795,beta,1,Islet3-fresh_TCCGGAGAGTAGAGGACTAAGCCTGGCTCTGA,4.901059
EIF1AY_133795,beta,2,Islet3-fresh_TCCGGAGATACGCTGCAAGGCTATGGTTGCGT,18.923257
EIF1AY_133795,beta,1,Islet3-fresh_TCCGGAGATAGGCATGTTCTAGCTATAGAGGC,4.65522
EIF1AY_133795,beta,2,Islet3-fresh_TCCGGAGATCGACGTCCTCTCTATCAGGACGT,14.077267
EIF1AY_133795,beta,1,Islet3-fresh_TCCGGAGATCGACGTCCTCTCTATGGTTGCGT,8.113267


### 2.1 bin data 

In [29]:
require(tidyverse)
# (min, max) of `ps` for each cell type, kept in a named list.
ps.range <- list(
    alpha = range(filter(dat.pro_ps, celltype == "alpha")$ps),
    beta = range(filter(dat.pro_ps, celltype == "beta")$ps)
)
ps.range

In [31]:
# N.bin is the number of break points; N.bin break points delimit N.bin - 1 bins.
N.bin <- 101
# Equally spaced break points spanning each cell type's own ps range.
ps.bks <- list()
ps.bks$alpha <- seq(ps.range$alpha[1], ps.range$alpha[2], length.out = N.bin)
ps.bks$beta <- seq(ps.range$beta[1], ps.range$beta[2], length.out = N.bin)

# Assign every row to a ps bin using the break points of its own cell type.
# include.lowest = TRUE keeps the minimum ps value inside the first bin.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
dat.pro_ps <- rbind(
    dat.pro_ps %>%
        filter(celltype == "alpha") %>%
        mutate(ps_bin = cut(ps, breaks = ps.bks$alpha, include.lowest = TRUE)),
    dat.pro_ps %>%
        filter(celltype == "beta") %>%
        mutate(ps_bin = cut(ps, breaks = ps.bks$beta, include.lowest = TRUE))
)

# Drop the transcript id and raw ps, then keep one row per distinct
# combination of the remaining columns.
ps.bin.dat.sum <- dat.pro_ps %>%
    select(-one_of("gene_tr.idx", "ps")) %>%
    distinct()

setDT(ps.bin.dat.sum)
# N = number of those distinct rows in each (ps_bin, celltype) combination.
ps.bin.dat.sum <- ps.bin.dat.sum[, .(.N), by = .(ps_bin, celltype)]
# Report the actual number of bins (break points minus one), not N.bin itself.
cat(paste(N.bin - 1, "# bins:\n"))
cat("quantiles of alpha cell number per bin:\n")
t(quantile((ps.bin.dat.sum %>% filter(celltype == "alpha"))$N))
cat("quantiles of beta cell number per bin:\n")
t(quantile((ps.bin.dat.sum %>% filter(celltype == "beta"))$N))

101 # bins:
quantiles of alpha cell number per bin:


0%,25%,50%,75%,100%
1,21,41,71.25,257


quantiles of beta cell number per bin:


0%,25%,50%,75%,100%
2,18.75,33.5,77,655


In [32]:
# Sum of the per-bin counts N for each cell type.
summarise(group_by(ps.bin.dat.sum, celltype), tot = sum(N))

celltype,tot
alpha,5594
beta,7170


## 3. filter on glist

In [14]:
# Load the saved transcript lists; the elements `alpha_tr` and `beta_tr`
# are the ones used below.
dy.glist <- readRDS("../dat/1901/dy.glist.Rds")
# Distinct transcripts appearing in either list.
all.tr <- c(dy.glist$alpha_tr, dy.glist$beta_tr) %>% unique()
all.tr %>% length()
# Length of every element of the list.
dy.glist %>% lapply(length)

In [33]:
# Keep, for each cell type, only the rows whose transcript is in that
# cell type's list (dy.glist$alpha_tr / dy.glist$beta_tr).
dim(dat.pro_ps)
dat.pro_ps.filtered <- rbind(
    filter(dat.pro_ps, celltype == "alpha", gene_tr.idx %in% dy.glist$alpha_tr),
    filter(dat.pro_ps, celltype == "beta", gene_tr.idx %in% dy.glist$beta_tr)
)
dim(dat.pro_ps.filtered)

## 4. Calc percent_open & smooth

### 4.1 convert to percentage opening per bin

In [34]:
# Preview the first rows of the filtered table.
dat.pro_ps.filtered %>% head()

gene_tr.idx,celltype,subtype,cell,ps,ps_bin
HES4_34,alpha,1,Islet1-fresh_AGACACCTATCTCAGGTCGACTAGTATAG,0.5915286,"(0.54,0.607]"
HES4_34,alpha,2,Islet1-fresh_AGACACCTATGCGCAGCTCTCTATAGGCG,6.2034799,"(6.14,6.21]"
HES4_34,alpha,2,Islet1-fresh_AGACACCTCGAGGCTGAAGGCTATGGTTG,6.2548629,"(6.21,6.27]"
HES4_34,alpha,2,Islet1-fresh_AGACACCTCGAGGCTGGCGTAAGAGTACT,6.4780116,"(6.48,6.54]"
HES4_34,alpha,2,Islet1-fresh_AGACACCTCGATCAGTAAGGAGTAATAGA,4.720515,"(4.65,4.72]"
HES4_34,alpha,2,Islet1-fresh_AGACACCTCGTACTAGCTAAGCCTGTACT,6.5085744,"(6.48,6.54]"


In [36]:
# Attach N (the per-bin count held in ps.bin.dat.sum) to every filtered row.
# The join keys are spelled out so the result cannot change silently if either
# table later gains another column with a shared name.
dat.pro_ps.long <- dat.pro_ps.filtered %>%
    left_join(ps.bin.dat.sum, by = c("celltype", "ps_bin"))
dim(dat.pro_ps.long)
head(dat.pro_ps.long %>% arrange(gene_tr.idx, ps_bin))

Joining, by = c("celltype", "ps_bin")


gene_tr.idx,celltype,subtype,cell,ps,ps_bin,N
AAAS_28691,alpha,1,Islet2-fresh_AGATCTTCATGCGCAGAGGAGTAAGGC,0.2992474,"(0.27,0.337]",34
AAAS_28691,alpha,1,Islet2-fresh_CAGTTGCACCTAAGACGTCTAATCTGA,0.2918434,"(0.27,0.337]",34
AAAS_28691,alpha,1,Islet2-fresh_ACATTGGCTCGACGTCTAAGCCTTCCT,0.3450805,"(0.337,0.405]",66
AAAS_28691,alpha,1,Islet2-fresh_ACATTGGCTGCAGCTACTAGAGTTGAC,0.3460007,"(0.337,0.405]",66
AAAS_28691,alpha,1,Islet2-fresh_GCTCTAAGACTCGCTACGTAAGATCCT,0.4034265,"(0.337,0.405]",66
AAAS_28691,alpha,1,Islet3-fresh_AGCGATAGATCTCAGGCTCTCTATTATAGCCT,0.3993824,"(0.337,0.405]",66


In [37]:
# Collapse to one row per (gene_tr.idx, celltype, ps_bin):
#   n            = number of rows for that transcript in the bin
#   N            = per-bin count from ps.bin.dat.sum (explicit join keys, so the
#                  join does not depend on which column names happen to match)
#   ps_bin.2     = integer code of the ps_bin factor level
#   percent_open = n / N * 100
# NOTE(review): the recorded output shows beta codes starting above 100, i.e.
# the combined factor appears to list the alpha levels before the beta levels;
# later cells rely on that ordering - confirm if the binning changes.
dat.pro_ps.long <- dat.pro_ps.long %>%
    group_by(gene_tr.idx, celltype, ps_bin) %>%
    summarise(n = n()) %>%
    left_join(ps.bin.dat.sum, by = c("celltype", "ps_bin")) %>%
    arrange(gene_tr.idx, ps_bin) %>%
    mutate(ps_bin.2 = as.numeric(ps_bin)) %>%
    mutate(percent_open = n / N * 100)
head(dat.pro_ps.long)
dim(dat.pro_ps.long)
head(dat.pro_ps.long %>% filter(celltype == "beta"))

Joining, by = c("celltype", "ps_bin")


gene_tr.idx,celltype,ps_bin,n,N,ps_bin.2,percent_open
AAAS_28691,alpha,"(0.27,0.337]",2,34,5,5.882353
AAAS_28691,alpha,"(0.337,0.405]",6,66,6,9.090909
AAAS_28691,alpha,"(0.405,0.472]",19,172,7,11.046512
AAAS_28691,alpha,"(0.472,0.54]",25,257,8,9.727626
AAAS_28691,alpha,"(0.54,0.607]",28,212,9,13.207547
AAAS_28691,alpha,"(0.607,0.675]",12,73,10,16.438356


gene_tr.idx,celltype,ps_bin,n,N,ps_bin.2,percent_open
ABCB6_81071,beta,"(0.202,0.404]",1,6,102,16.666667
ABCB6_81071,beta,"(0.404,0.606]",1,15,103,6.666667
ABCB6_81071,beta,"(0.606,0.808]",5,35,104,14.285714
ABCB6_81071,beta,"(0.808,1.01]",8,86,105,9.302326
ABCB6_81071,beta,"(1.01,1.21]",15,191,106,7.853403
ABCB6_81071,beta,"(1.21,1.41]",21,326,107,6.441718


In [38]:
# Preview the per-bin summary and report its dimensions.
dat.pro_ps.long %>% head()
dat.pro_ps.long %>% dim()

gene_tr.idx,celltype,ps_bin,n,N,ps_bin.2,percent_open
AAAS_28691,alpha,"(0.27,0.337]",2,34,5,5.882353
AAAS_28691,alpha,"(0.337,0.405]",6,66,6,9.090909
AAAS_28691,alpha,"(0.405,0.472]",19,172,7,11.046512
AAAS_28691,alpha,"(0.472,0.54]",25,257,8,9.727626
AAAS_28691,alpha,"(0.54,0.607]",28,212,9,13.207547
AAAS_28691,alpha,"(0.607,0.675]",12,73,10,16.438356


In [39]:
# Reshape to one row per (gene_tr.idx, celltype) and one column per bin index,
# holding percent_open; transcript/bin combinations with no row become 0.
# The beta bin codes are shifted down by the number of bins per cell type
# (N.bin - 1, since N.bin counts break points) so that both cell types share
# the same column names. Deriving the shift from N.bin instead of hard-coding
# 100 keeps this cell correct when the number of bins is changed.
dat.pro_ps.wide <- rbind(
    dat.pro_ps.long %>%
        filter(celltype == "alpha") %>%
        group_by(gene_tr.idx, celltype) %>%
        select(-one_of(c("ps_bin", "n", "N"))) %>%
        spread(key = ps_bin.2, value = percent_open, fill = 0),
    dat.pro_ps.long %>%
        filter(celltype == "beta") %>%
        group_by(gene_tr.idx, celltype) %>%
        select(-one_of(c("ps_bin", "n", "N"))) %>%
        mutate(ps_bin.2 = ps_bin.2 - (N.bin - 1)) %>%
        spread(key = ps_bin.2, value = percent_open, fill = 0)
)
head(dat.pro_ps.wide)
tail(dat.pro_ps.wide)
dim(dat.pro_ps.wide)

gene_tr.idx,celltype,1,2,3,4,5,6,7,8,⋯,91,92,93,94,95,96,97,98,99,100
AAAS_28691,alpha,0,0,0,0.0,5.882353,9.090909,11.04651,9.727626,⋯,11.51515,12.66667,16.82243,21.25,15.51724,22.58065,10.526316,0,16.66667,0
AAED1_127544,alpha,0,0,0,0.0,8.823529,18.181818,18.02326,14.396887,⋯,16.9697,22.66667,23.36449,26.25,29.31034,19.35484,26.315789,30,33.33333,0
AAGAB_42733,alpha,0,0,0,0.0,17.647059,6.060606,11.62791,11.673152,⋯,20.60606,19.33333,18.69159,20.0,27.58621,38.70968,31.578947,40,33.33333,0
AAMDC_22628,alpha,0,0,0,28.57143,23.529412,12.121212,16.27907,21.789883,⋯,26.66667,33.33333,28.03738,41.25,31.03448,32.25806,36.842105,60,50.0,50
AAMP_80793,alpha,0,100,0,14.28571,17.647059,12.121212,15.69767,15.953307,⋯,17.57576,22.66667,21.49533,40.0,31.03448,45.16129,5.263158,30,16.66667,0
AASDHPPT_23418,alpha,0,100,100,71.42857,29.411765,27.272727,39.53488,32.684825,⋯,32.72727,33.33333,36.4486,40.0,44.82759,38.70968,68.421053,60,66.66667,100


gene_tr.idx,celltype,1,2,3,4,5,6,7,8,⋯,91,92,93,94,95,96,97,98,99,100
ZNF846_65570,beta,0,0,0.0,2.857143,1.162791,1.570681,4.294479,3.249097,⋯,10.041841,8.5,13.953488,9.638554,12.068966,6.25,18.181818,0,0,40
ZP1_19678,beta,0,0,0.0,2.857143,4.651163,7.329843,5.521472,4.873646,⋯,2.09205,0.5,2.325581,1.204819,1.724138,0.0,0.0,0,0,0
ZSCAN22_73416,beta,0,0,0.0,11.428571,3.488372,4.188482,3.680982,3.971119,⋯,10.041841,10.0,16.27907,9.638554,5.172414,12.5,22.727273,20,20,0
ZSCAN32_46213,beta,0,0,6.666667,5.714286,6.976744,7.853403,7.055215,9.205776,⋯,11.715481,17.5,13.953488,15.662651,12.068966,12.5,18.181818,0,20,0
ZSWIM2_79591,beta,0,50,46.666667,45.714286,23.255814,24.60733,23.619632,15.703971,⋯,7.949791,10.0,3.875969,6.024096,8.62069,3.125,9.090909,0,20,0
ZSWIM5_3921,beta,0,0,0.0,11.428571,6.976744,9.947644,5.828221,7.761733,⋯,11.715481,17.5,20.155039,25.301205,13.793103,18.75,4.545455,10,20,20


In [40]:
# Back to long format: after the wide step every transcript has a row for
# every bin column, with 0 where it had no data.
# The bin columns are selected as "everything except the two id columns"
# rather than by fixed positions (3:102), so this keeps working if the number
# of bins changes.
# NOTE(review): gather() stores the key column (`bin`) as character unless
# convert = TRUE is given - confirm that downstream code expects that.
dat.pro_ps.long <- dat.pro_ps.wide %>%
    gather(key = "bin", value = "percent_open", -gene_tr.idx, -celltype)
head(dat.pro_ps.long)
tail(dat.pro_ps.long)
dim(dat.pro_ps.long)

gene_tr.idx,celltype,bin,percent_open
AAAS_28691,alpha,1,0
AAED1_127544,alpha,1,0
AAGAB_42733,alpha,1,0
AAMDC_22628,alpha,1,0
AAMP_80793,alpha,1,0
AASDHPPT_23418,alpha,1,0


gene_tr.idx,celltype,bin,percent_open
ZNF846_65570,beta,100,40
ZP1_19678,beta,100,0
ZSCAN22_73416,beta,100,0
ZSCAN32_46213,beta,100,0
ZSWIM2_79591,beta,100,0
ZSWIM5_3921,beta,100,20


### 4.2 data smooth

In [41]:
library(parallel)
# For each cell type, fit one loess curve per transcript (percent_open ~ bin)
# and keep the fitted value at every bin. The result is a list with one
# element per cell type, each a list of per-transcript data frames.
dat.pro_ps.smooth <- list()
system.time(
    for (ct in c("alpha", "beta")) {
        dat.sub <- subset(dat.pro_ps.long, celltype == ct)
        all.tr <- unique(dat.sub$gene_tr.idx)
        # Split once per cell type instead of re-filtering the whole table
        # for every transcript.
        dat.by.tr <- split(
            as.data.frame(dat.sub),
            as.character(dat.sub$gene_tr.idx)
        )
        fits <- mclapply(all.tr, function(g) {
            dat.g <- dat.by.tr[[as.character(g)]]
            # Make the predictor explicitly numeric (it may arrive as
            # character after the wide-to-long reshape).
            dat.g$bin <- as.integer(dat.g$bin)
            data.frame(
                # Take the bins from the data so they always line up with the
                # fitted values, whatever the number of bins is.
                bin = dat.g$bin,
                gene_tr.idx = g,
                percent_open = predict(loess(percent_open ~ bin, data = dat.g)),
                celltype = ct
            )
        }, mc.cores = 12)
        # mclapply() hands back failed jobs as "try-error" objects (with only
        # a warning); stop here rather than carry them into the output.
        failed <- vapply(
            fits,
            function(f) is.null(f) || inherits(f, "try-error"),
            logical(1)
        )
        if (any(failed)) {
            stop(
                "loess smoothing failed for ", sum(failed), " ", ct,
                " transcript(s), e.g. ", all.tr[which(failed)[1]],
                call. = FALSE
            )
        }
        dat.pro_ps.smooth[[ct]] <- fits
    }
)


Loading required package: parallel


    user   system  elapsed 
1396.222   65.588  149.225 

In [46]:
# The smoothing loop leaves dat.pro_ps.smooth as a list (one element per cell
# type) of per-transcript data frames. Stack them into a single data frame,
# alpha first, so that it can be previewed and written out as one table.
# The guard makes the cell safe to re-run once the object is already combined.
if (!is.data.frame(dat.pro_ps.smooth)) {
    dat.pro_ps.smooth <- rbind(
        do.call(rbind, dat.pro_ps.smooth$alpha),
        do.call(rbind, dat.pro_ps.smooth$beta)
    )
}
head(dat.pro_ps.smooth)
dim(dat.pro_ps.smooth)

bin,gene_tr.idx,percent_open,celltype
1,AAAS_28691,4.931862,alpha
2,AAAS_28691,5.279896,alpha
3,AAAS_28691,5.615049,alpha
4,AAAS_28691,5.937014,alpha
5,AAAS_28691,6.245485,alpha
6,AAAS_28691,6.540155,alpha


In [47]:
# Write the smoothed per-bin values.
fwrite(
    x = dat.pro_ps.smooth,
    file = "../dat/1901/alpha_beta.promoter.long_100bin_transcripts_ps_smoothed_dy.txt"
)
# Write the unsmoothed per-bin percent_open table.
fwrite(
    x = dat.pro_ps.long,
    file = "../dat/1901/alpha_beta.promoter.long_100bin_transcripts_ps_raw_dy.txt"
)

In [1]:
# Open the documentation for chisq.test.
help("chisq.test")