In [2]:
require(data.table)
require(tidyverse)

## 1. Rationale

* due to the sparseness, the ps is binned to 100 (can change) 
* then chisq test wheter the ratio of openning is dependent on bins 

##  2. load data

In [3]:
dat.pro_ps<- fread("../dat/1901/alpha_beta.promoter.long_matrix_w_transcripts_ps.txt")%>%
    separate(cluster,into = c("celltype","subtype"))
head(dat.pro_ps)
tail(dat.pro_ps)
dim(dat.pro_ps)

gene_tr.idx,celltype,subtype,cell,ps
SAMD11_7,alpha,1,Islet1-fresh_AGACACCTAGGCAGAAGTAAGGAGCAGGA,5.643735
SAMD11_7,alpha,1,Islet1-fresh_AGACACCTATGCGCAGCGTCTAATGGTTG,1.056578
SAMD11_7,alpha,2,Islet1-fresh_AGACACCTCGAGGCTGAAGGCTATGGTTG,6.254863
SAMD11_7,alpha,2,Islet1-fresh_AGACACCTCGTACTAGCTAAGCCTGTACT,6.508574
SAMD11_7,alpha,1,Islet1-fresh_AGACACCTGGACTCCTTCGACTAGGGTTG,2.12067
SAMD11_7,alpha,2,Islet1-fresh_AGACACCTGGAGCTACAAGGAGTAAGGCG,5.564368


gene_tr.idx,celltype,subtype,cell,ps
EIF1AY_133795,beta,1,Islet3-fresh_TCCGGAGAGGAGCTACTCGACTAGGTACTGAC,3.157942
EIF1AY_133795,beta,1,Islet3-fresh_TCCGGAGAGTAGAGGACTAAGCCTGGCTCTGA,4.901059
EIF1AY_133795,beta,2,Islet3-fresh_TCCGGAGATACGCTGCAAGGCTATGGTTGCGT,18.923257
EIF1AY_133795,beta,1,Islet3-fresh_TCCGGAGATAGGCATGTTCTAGCTATAGAGGC,4.65522
EIF1AY_133795,beta,2,Islet3-fresh_TCCGGAGATCGACGTCCTCTCTATCAGGACGT,14.077267
EIF1AY_133795,beta,1,Islet3-fresh_TCCGGAGATCGACGTCCTCTCTATGGTTGCGT,8.113267


### 2.1 bin data 

In [4]:
require(tidyverse)
ps.range <- list()
ps.range$alpha <- range((dat.pro_ps %>%filter(celltype =="alpha"))$ps)
ps.range$beta <- range((dat.pro_ps %>%filter(celltype=="beta"))$ps)
ps.range

In [5]:
N.bin<- 101
ps.bks <- list()
ps.bks$alpha <- seq(ps.range$alpha[1],ps.range$alpha[2],length.out = N.bin)
ps.bks$beta <- seq(ps.range$beta[1],ps.range$beta[2],length.out = N.bin)

dat.pro_ps <- rbind(
    dat.pro_ps%>% 
    filter(celltype=="alpha")%>%
    mutate(ps_bin =cut(ps,breaks = ps.bks$alpha,include.lowest = T)),
    dat.pro_ps%>% 
    filter(celltype=="beta")%>%
    mutate(ps_bin =cut(ps,breaks = ps.bks$beta,include.lowest = T))
)

ps.bin.dat.sum<- dat.pro_ps%>%
select(-one_of("gene_tr.idx","ps"))%>%distinct()

setDT(ps.bin.dat.sum)
ps.bin.dat.sum<-ps.bin.dat.sum[, .(.N), by = .(ps_bin,celltype)]
cat(paste(N.bin,"# bins:\n"))
cat("quantiles of alpha cell number per bin:\n")
t(quantile((ps.bin.dat.sum%>%filter(celltype=="alpha"))$N))
cat("quantiles of beta cell number per bin:\n")
t(quantile((ps.bin.dat.sum%>%filter(celltype=="beta"))$N))

101 # bins:
quantiles of alpha cell number per bin:


0%,25%,50%,75%,100%
1,21,41,71.25,257


quantiles of beta cell number per bin:


0%,25%,50%,75%,100%
2,18.75,33.5,77,655


In [6]:
ps.bin.dat.sum%>%group_by(celltype)%>%summarise(tot=sum(N))

celltype,tot
alpha,5594
beta,7170


## 3. filter on glist

In [7]:
dy.glist<- readRDS('../dat/1901/dy.glist.variable.Rds')
all.tr <- unique(c(dy.glist$alpha_tr,dy.glist$beta_tr))
length(all.tr)
lapply(dy.glist,length)

In [8]:
#gene_tr.idx	cluster	
dim(dat.pro_ps)
dat.pro_ps.filtered <- rbind(
    dat.pro_ps%>% filter(gene_tr.idx%in% dy.glist$alpha_tr,celltype=="alpha"),
    dat.pro_ps%>% filter(gene_tr.idx%in% dy.glist$beta_tr,celltype=="beta")
)
dim(dat.pro_ps.filtered)

## 4. Calc percent_open & smooth

### 4.1 convert to percetage openning per bin

In [9]:
head(dat.pro_ps.filtered)

gene_tr.idx,celltype,subtype,cell,ps,ps_bin
HES4_34,alpha,1,Islet1-fresh_AGACACCTATCTCAGGTCGACTAGTATAG,0.5915286,"(0.54,0.607]"
HES4_34,alpha,2,Islet1-fresh_AGACACCTATGCGCAGCTCTCTATAGGCG,6.2034799,"(6.14,6.21]"
HES4_34,alpha,2,Islet1-fresh_AGACACCTCGAGGCTGAAGGCTATGGTTG,6.2548629,"(6.21,6.27]"
HES4_34,alpha,2,Islet1-fresh_AGACACCTCGAGGCTGGCGTAAGAGTACT,6.4780116,"(6.48,6.54]"
HES4_34,alpha,2,Islet1-fresh_AGACACCTCGATCAGTAAGGAGTAATAGA,4.720515,"(4.65,4.72]"
HES4_34,alpha,2,Islet1-fresh_AGACACCTCGTACTAGCTAAGCCTGTACT,6.5085744,"(6.48,6.54]"


In [10]:
dat.pro_ps.long <- dat.pro_ps.filtered%>%
    left_join(ps.bin.dat.sum)
dim(dat.pro_ps.long)
head(dat.pro_ps.long%>%arrange(gene_tr.idx,ps_bin))

Joining, by = c("celltype", "ps_bin")


gene_tr.idx,celltype,subtype,cell,ps,ps_bin,N
AAGAB_42733,alpha,1,Islet2-fresh_AGACGGAGTAGGCATGTAAGGAGTGAC,0.28742,"(0.27,0.337]",34
AAGAB_42733,alpha,1,Islet2-fresh_CAGTTGCACCTAAGACGTCTAATCTGA,0.2918434,"(0.27,0.337]",34
AAGAB_42733,alpha,1,Islet2-fresh_CAGTTGCATAGGCATGTCTGAGTTGAC,0.3206313,"(0.27,0.337]",34
AAGAB_42733,alpha,1,Islet3-fresh_CGCTCATTGCTCATGATATCCTCTGGCTCTGA,0.3131417,"(0.27,0.337]",34
AAGAB_42733,alpha,1,Islet3-fresh_TAATGCGCGCTCATGAAAGGCTATGGCTCTGA,0.2783736,"(0.27,0.337]",34
AAGAB_42733,alpha,1,Islet3-fresh_TCCGCGAACTCTCTACCCTAGAGTGGCTCTGA,0.3178772,"(0.27,0.337]",34


In [11]:
dat.pro_ps.long <- dat.pro_ps.long%>% 
    group_by(gene_tr.idx,celltype,ps_bin)%>%summarise(n= n())%>%
    left_join(ps.bin.dat.sum)%>%arrange(gene_tr.idx,ps_bin)%>%
    mutate(ps_bin.2 = as.numeric(ps_bin))%>% 
    mutate(percent_open=n/N*100)
head(dat.pro_ps.long)
dim(dat.pro_ps.long)
head(dat.pro_ps.long%>%filter(celltype=="beta"))

Joining, by = c("celltype", "ps_bin")


gene_tr.idx,celltype,ps_bin,n,N,ps_bin.2,percent_open
AAGAB_42733,alpha,"(0.27,0.337]",6,34,5,17.647059
AAGAB_42733,alpha,"(0.337,0.405]",4,66,6,6.060606
AAGAB_42733,alpha,"(0.405,0.472]",20,172,7,11.627907
AAGAB_42733,alpha,"(0.472,0.54]",30,257,8,11.673152
AAGAB_42733,alpha,"(0.54,0.607]",29,212,9,13.679245
AAGAB_42733,alpha,"(0.607,0.675]",12,73,10,16.438356


gene_tr.idx,celltype,ps_bin,n,N,ps_bin.2,percent_open
AASS_118936,beta,"[0,0.202]",1,2,101,50.0
AASS_118936,beta,"(0.202,0.404]",2,6,102,33.33333
AASS_118936,beta,"(0.404,0.606]",3,15,103,20.0
AASS_118936,beta,"(0.606,0.808]",17,35,104,48.57143
AASS_118936,beta,"(0.808,1.01]",19,86,105,22.09302
AASS_118936,beta,"(1.01,1.21]",34,191,106,17.80105


In [12]:
dat.pro_ps.wide <- rbind(dat.pro_ps.long%>%
    filter(celltype=="alpha")%>%
    group_by(gene_tr.idx,celltype) %>%
    select(-one_of(c("ps_bin","n","N")))%>%
    spread(key = ps_bin.2,value = percent_open,fill = 0),
               dat.pro_ps.long%>%
    filter(celltype=="beta")%>%
    group_by(gene_tr.idx,celltype) %>%
    select(-one_of(c("ps_bin","n","N")))%>%
    mutate(ps_bin.2=ps_bin.2-100)%>%
    spread(key = ps_bin.2,value = percent_open,fill = 0))
head(dat.pro_ps.wide)
tail(dat.pro_ps.wide)
dim(dat.pro_ps.wide)

gene_tr.idx,celltype,1,2,3,4,5,6,7,8,⋯,91,92,93,94,95,96,97,98,99,100
AAGAB_42733,alpha,0,0,0,0.0,17.647059,6.060606,11.627907,11.6731518,⋯,20.606061,19.333333,18.691589,20.0,27.586207,38.709677,31.57895,40,33.33333,0
ABCB9_32864,alpha,0,0,0,0.0,0.0,0.0,1.162791,0.3891051,⋯,3.636364,6.0,13.084112,3.75,12.068966,12.903226,15.78947,10,0.0,0
ABCC8_17965,alpha,0,0,0,14.28571,20.588235,22.727273,21.511628,24.5136187,⋯,2.424242,4.666667,1.869159,2.5,5.172414,3.225806,21.05263,0,0.0,0
ABCD3_5661,alpha,0,0,0,14.28571,23.529412,15.151515,19.186047,17.8988327,⋯,21.818182,28.666667,32.71028,23.75,18.965517,48.387097,52.63158,40,16.66667,50
ABCF2_120214,alpha,0,0,0,0.0,5.882353,7.575758,6.395349,6.2256809,⋯,9.69697,15.333333,18.691589,17.5,15.517241,22.580645,42.10526,30,0.0,50
ABHD2_44351,alpha,0,0,0,0.0,0.0,0.0,1.162791,0.3891051,⋯,4.848485,8.666667,12.149533,8.75,12.068966,12.903226,21.05263,0,0.0,50


gene_tr.idx,celltype,1,2,3,4,5,6,7,8,⋯,91,92,93,94,95,96,97,98,99,100
ZNF841_72189,beta,0,0.0,0.0,5.714286,3.488372,2.094241,2.453988,2.5270758,⋯,5.020921,7.5,6.976744,4.819277,8.62069,18.75,9.090909,10,0,0
ZSCAN22_73416,beta,0,0.0,0.0,11.428571,3.488372,4.188482,3.680982,3.9711191,⋯,10.041841,10.0,16.27907,9.638554,5.172414,12.5,22.727273,20,20,0
ZSCAN5A_73073,beta,0,0.0,6.666667,0.0,4.651163,5.235602,3.374233,5.5956679,⋯,7.949791,8.0,6.976744,13.253012,5.172414,9.375,9.090909,30,0,0
ZSWIM2_79591,beta,0,50.0,46.666667,45.714286,23.255814,24.60733,23.619632,15.7039711,⋯,7.949791,10.0,3.875969,6.024096,8.62069,3.125,9.090909,0,20,0
ZUFSP_112980,beta,50,16.66667,13.333333,25.714286,15.116279,9.947644,8.588957,7.400722,⋯,4.60251,8.0,6.976744,13.253012,6.896552,9.375,9.090909,0,20,20
ZZEF1_52563,beta,0,0.0,0.0,0.0,0.0,0.0,0.0,0.3610108,⋯,1.67364,1.5,1.550388,2.409639,0.0,6.25,9.090909,0,0,0


In [13]:
# wide to long again to fill 

dat.pro_ps.long <- dat.pro_ps.wide%>%gather(key = "bin",value = "percent_open",3:102)
head(dat.pro_ps.long)
tail(dat.pro_ps.long)
dim(dat.pro_ps.long)

gene_tr.idx,celltype,bin,percent_open
AAGAB_42733,alpha,1,0
ABCB9_32864,alpha,1,0
ABCC8_17965,alpha,1,0
ABCD3_5661,alpha,1,0
ABCF2_120214,alpha,1,0
ABHD2_44351,alpha,1,0


gene_tr.idx,celltype,bin,percent_open
ZNF841_72189,beta,100,0
ZSCAN22_73416,beta,100,0
ZSCAN5A_73073,beta,100,0
ZSWIM2_79591,beta,100,0
ZUFSP_112980,beta,100,20
ZZEF1_52563,beta,100,0


### 4.2 data smooth

In [14]:
require(parallel)
dat.pro_ps.smooth <- list()
system.time(
    for(ct in c("alpha","beta")){
    dat.sub <- subset(dat.pro_ps.long,celltype==ct)
    all.tr <- unique(dat.sub$gene_tr.idx)
    dat.pro_ps.smooth[[ct]] <- mclapply(all.tr,
                           function(g){data.frame(bin=1:100,
                                                  gene_tr.idx=g,
                                                  percent_open=predict(loess(percent_open~bin, data=dat.sub%>%filter(gene_tr.idx==g))),
                                                  celltype=ct)},mc.cores=12)
})


Loading required package: parallel


   user  system elapsed 
 69.580  32.862  20.436 

In [25]:
dat.pro_ps.smooth<- rbind(do.call(rbind,dat.pro_ps.smooth$alpha),do.call(rbind,dat.pro_ps.smooth$beta))

In [26]:
fwrite(dat.pro_ps.smooth,file = "../dat/1901/alpha_beta.promoter.long_100bin_transcripts_ps_smoothed_dy_varible.txt")
fwrite(dat.pro_ps.long,file = "../dat/1901/alpha_beta.promoter.long_100bin_transcripts_ps_raw_dy_varible.txt")