In [2]:
require(data.table)
require(tidyverse)

## 1. Rationale

* due to the sparseness, the ps is binned to 100 (can change) 
* then chisq test wheter the ratio of openning is dependent on bins 

##  2. load data

In [4]:
dat.pro_ps<- fread("../dat/1910_v2/abd.promoter.long_matrix_w_transcripts_ps.txt")
head(dat.pro_ps,1)
tail(dat.pro_ps,1)
dim(dat.pro_ps)

prom_peak,cell,ps,celltype
<chr>,<chr>,<dbl>,<chr>
10:100027284-100028604,Islet1fresh_AGACACCTAAGAGGCAGCGTAAGAAGGCG,11.8747,alpha


prom_peak,cell,ps,celltype
<chr>,<chr>,<dbl>,<chr>
Y:6777959-6779589,Islet3fresh_TCCGGAGATGCAGCTAATCTGAGTAGGCGAAG,12.20114,beta


## 3 bin ps to get fraction of state 1 cells

### 3.1 100 bins for a,b; 20 bins for d

In [5]:
ps.range <- lapply(c("alpha", "beta", "delta"), function(ct) dat.pro_ps %>% filter(celltype == 
    ct) %>% pull(ps) %>% range)
names(ps.range) <- c("alpha", "beta", "delta")
ps.range

In [6]:
N.bin <- 101
N.bin.d<- 21
ps.bks <- list()
ps.bks$alpha <- seq(ps.range$alpha[1], ps.range$alpha[2] + 1e-05, length.out = N.bin)
ps.bks$beta <- seq(ps.range$beta[1], ps.range$beta[2] + 1e-05, length.out = N.bin)
ps.bks$delta <- seq(ps.range$delta[1], ps.range$delta[2] + 1e-05, length.out = N.bin.d)

dat.pro_ps <- rbind(dat.pro_ps %>% filter(celltype == "alpha") %>% mutate(ps_bin = cut(ps, 
    breaks = ps.bks$alpha, include.lowest = T, right = F)) %>% mutate(ps_bin_idx = as.numeric(ps_bin)), 
    dat.pro_ps %>% filter(celltype == "beta") %>% mutate(ps_bin = cut(ps, breaks = ps.bks$beta, 
        include.lowest = T, right = F)) %>% mutate(ps_bin_idx = as.numeric(ps_bin)), 
    dat.pro_ps %>% filter(celltype == "delta") %>% mutate(ps_bin = cut(ps, breaks = ps.bks$delta, 
        include.lowest = T, right = F)) %>% mutate(ps_bin_idx = as.numeric(ps_bin))) %>% 
    arrange(celltype, ps_bin_idx)

dat.pro_ps %>% head(1)

prom_peak,cell,ps,celltype,ps_bin,ps_bin_idx
<chr>,<chr>,<dbl>,<chr>,<fct>,<dbl>
10:100027284-100028604,Islet3fresh_AGACACCTACTGAGCGACTGCATAGGTTGCGT,0.1583683,alpha,"[0,0.209)",1


In [12]:
fread('../dat/1910_v2/islet.cluster_labels.filt.txt')%>%head(1)
fread('../dat/1910_v2/islet.cluster_labels.filt.txt')%>%nrow

index,UMAP1,UMAP2,cluster_name,unique_usable_reads,log_usable_counts,frac_duplicated_reads,frac_mito_reads,frac_promoters_used,frac_reads_in_peaks,frac_reads_in_promoters
<chr>,<dbl>,<dbl>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Islet1fresh_AGACACCTAAGAGGCAAAGGAGTAGGCTC,-5.102393,1.38682,alpha_2,6254,7.865955,0.7902986,0.03428042,0.04337516,0.5978574,0.2646306


In [8]:
ps.bin.dat.sum <- dat.pro_ps %>% select(-prom_peak, -ps) %>% distinct() %>% select(-celltype) %>% 
    left_join(fread("../dat/1910_v2/islet.cluster_labels.filt.txt") %>% select(index, 
        cluster_name), by = c(cell = "index")) %>% separate(cluster_name, into = c("celltype", 
    "subtype"), sep = "_")
ps.bin.dat.sum %>% head(1)
ps.bin.dat.sum %>% nrow

setDT(ps.bin.dat.sum)
ps.bin.dat.sum <- ps.bin.dat.sum %>% unite("ps_bin", 3:2, sep = ":", )
ps.bin.dat.sum <- ps.bin.dat.sum[, .(.N), by = .(ps_bin, celltype, subtype)]
ps.bin.dat.sum <- (ps.bin.dat.sum %>% spread(key = "subtype", value = "N", fill = 0) %>% 
    mutate(N = (`1` + `2`), frac = `1`/(`1` + `2`)))
ps.bin.dat.sum %>% group_by(celltype) %>% summarise(tot = sum(N))
ps.bin.dat.sum %>% head(1)
ps.bin.dat.sum %>% nrow

cell,ps_bin,ps_bin_idx,celltype,subtype
<chr>,<fct>,<dbl>,<chr>,<chr>
Islet3fresh_AGACACCTACTGAGCGACTGCATAGGTTGCGT,"[0,0.209)",1,alpha,1


celltype,tot
<chr>,<dbl>
alpha,6218
beta,7598
delta,710


ps_bin,celltype,1,2,N,frac
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
"1:[0,0.185)",beta,103,1,104,0.9903846


In [9]:
ps.bin.dat.sum <- ps.bin.dat.sum %>% separate(ps_bin, into = c("ps_bin_idx", "ps_bin"), 
    sep = ":")
fwrite(ps.bin.dat.sum, "../dat/1910_v2/ps.bin.dat.sum.csv")

In [9]:
setDT(ps.bin.dat.sum)
ps.bin.dat.sum <- ps.bin.dat.sum[, .(.N), by = .(ps_bin, celltype)]
ps.bin.dat.sum%>%head(1)
ps.bin.dat.sum%>%nrow
cat(paste(N.bin, "# bins:\n"))
cat("quantiles of alpha cell number per bin:\n")
t(quantile((ps.bin.dat.sum %>% filter(celltype == "alpha"))$N))
cat("quantiles of beta cell number per bin:\n")
t(quantile((ps.bin.dat.sum %>% filter(celltype == "beta"))$N))
cat("quantiles of delta cell number per bin:\n")
t(quantile((ps.bin.dat.sum %>% filter(celltype == "delta"))$N))

ps_bin,celltype,N
<fct>,<chr>,<int>
"(8.97,9.18]",alpha,211


101 # bins:
quantiles of alpha cell number per bin:


0%,25%,50%,75%,100%
1,15.5,39,71.5,658


quantiles of beta cell number per bin:


0%,25%,50%,75%,100%
2,29,56,94.25,555


quantiles of delta cell number per bin:


0%,25%,50%,75%,100%
1,4,7.5,12.75,55


### 3.2 Delta need bigger bin size
Or check `05a.Dynamical_data_prepare.ipynb`

In [12]:
N.bin <- 21
ps.bks <- list()
ps.bks$delta <- seq(ps.range$delta[1], ps.range$delta[2] + 1e-05, length.out = N.bin)

dat.pro_ps.d <- dat.pro_ps %>% filter(celltype == "delta") %>% mutate(ps_bin = cut(ps, 
    breaks = ps.bks$delta, include.lowest = T, right = F)) %>% mutate(ps_bin_idx = as.numeric(ps_bin)) %>% 
    arrange(celltype, ps_bin_idx)

dat.pro_ps.d %>% head(1)


ps.bin.dat.sum <- dat.pro_ps.d %>% select(-prom_peak, -ps) %>% distinct()
ps.bin.dat.sum %>% head(1)
ps.bin.dat.sum %>% nrow


setDT(ps.bin.dat.sum)
ps.bin.dat.sum <- ps.bin.dat.sum[, .(.N), by = .(ps_bin, celltype)]
ps.bin.dat.sum %>% head(1)
ps.bin.dat.sum %>% nrow
cat(paste(N.bin, "# bins:\n"))
cat("quantiles of delta cell number per bin:\n")
t(quantile((ps.bin.dat.sum %>% filter(celltype == "delta"))$N))

prom_peak,cell,ps,celltype,ps_bin,ps_bin_idx
<chr>,<chr>,<dbl>,<chr>,<fct>,<dbl>
10:101088824-101090586,Islet2fresh_TATCGGTTGGAGCTACCTGCATATGAC,0.05524501,delta,"[0,0.749)",1


cell,celltype,ps_bin,ps_bin_idx
<chr>,<chr>,<fct>,<dbl>
Islet2fresh_TATCGGTTGGAGCTACCTGCATATGAC,delta,"[0,0.749)",1


ps_bin,celltype,N
<fct>,<chr>,<int>
"[0,0.749)",delta,51


21 # bins:
quantiles of delta cell number per bin:


0%,25%,50%,75%,100%
7,19,34.5,48.75,92
