# Link the promoter opening matrix with cells' pseudostates

In [3]:
# Attach the packages with library(): it stops right here if a package is
# missing, whereas require() only returns FALSE and lets the notebook fail
# later at the first fread() / %>% call.
library(data.table)
library(tidyverse)

### 1. load promoter's data 

The input file is produced by [summarize data at transcript level.ipynb](./summarize%20data%20at%20transcript%20level.ipynb).

In [4]:
# Read the gzipped long-format promoter matrix and keep only the rows that
# carry a transcript index (tr.idx is not NA).
# The decompression command is passed through `cmd =` so fread does not have
# to guess whether the string is a file path or a shell command.
dat.pro <- fread(cmd = "gzcat ../dat/1910_v2//long_matrix_w_transcripts.txt.gz") %>%
  filter(!is.na(tr.idx))

# Quick look: table dimensions, first row, number of distinct cells.
dim(dat.pro)
head(dat.pro, 1)
length(unique(dat.pro$cell))

seq,start,end,cell,value,tr.idx,gene
<chr>,<int>,<int>,<chr>,<int>,<int>,<chr>
chr10,100027284,100028604,Islet1fresh_AGACACCTAAGAGGCAGCGTAAGAAGGCG,2,9209,LOXL4


In [5]:
# Build one "<seq>:<start>-<end>" label per promoter (first "chr" stripped
# from the sequence name) and keep the unique promoter/cell pairs only.
dat.pro <- dat.pro %>%
  mutate(prom_peak = paste0(sub("chr", "", seq), ":", start, "-", end)) %>%
  select(prom_peak, cell) %>%
  distinct()

# Quick look: table dimensions, number of distinct cells, first row.
dim(dat.pro)
n_distinct(dat.pro$cell)
head(dat.pro, 1)

prom_peak,cell
<chr>,<chr>
10:100027284-100028604,Islet1fresh_AGACACCTAAGAGGCAGCGTAAGAAGGCG


### 2. load cells' pseudostates

In [6]:
# Read one pseudotime table per cell type, tag every row with the cell type
# it came from, and stack the three tables into a single one.
dat.ps <- c("alpha", "beta", "delta") %>%
  lapply(function(celltype_name) {
    ps_path <- paste0("../dat/1910_v2/", celltype_name, ".pseudotime.txt")
    ps_tbl <- fread(ps_path, col.names = c("cell", "ps"))
    mutate(ps_tbl, celltype = celltype_name)
  }) %>%
  do.call(rbind, .)

# Quick look: first and last row, dimensions, cells per type, total cells.
head(dat.ps, 1)
tail(dat.ps, 1)
dim(dat.ps)
dat.ps %>%
  pull(celltype) %>%
  table()
sum(table(dat.ps$celltype))

“Detected 1 column names but the data has 2 columns (i.e. invalid file). Added 1 extra default column name for the first column which is guessed to be row names or an index. Use setnames() afterwards if this guess is not correct, or fix the file write command that created the file to create a valid file.”

cell,ps,celltype
<chr>,<dbl>,<chr>
Islet1fresh_AGACACCTAAGAGGCAAAGGAGTAGGCTC,13.98913,alpha


Unnamed: 0_level_0,cell,ps,celltype
Unnamed: 0_level_1,<chr>,<dbl>,<chr>
14526,Islet3fresh_TCCGGAGATGCAGCTACCTAGAGTGGTTGCGT,2.889196,delta


.
alpha  beta delta 
 6218  7598   710 

In [7]:
# Attach pseudotime (ps) and cell type to every promoter/cell pair, keeping
# only cells present in both tables.
# The join key is spelled out: "cell" is the column shared by both tables, and
# naming it keeps the join from silently changing if either table ever gains
# another column with a common name.
dat.pro_ps <- inner_join(dat.pro, dat.ps, by = "cell")

# Quick look: table dimensions, first row, number of distinct cells.
dim(dat.pro_ps)
head(dat.pro_ps, 1)
length(unique(dat.pro_ps$cell))

Joining, by = "cell"


prom_peak,cell,ps,celltype
<chr>,<chr>,<dbl>,<chr>
10:100027284-100028604,Islet1fresh_AGACACCTAAGAGGCAGCGTAAGAAGGCG,11.8747,alpha


In [8]:
# Count the distinct cells per cell type that remain after the join.
dat.pro_ps %>%
  distinct(cell, celltype) %>%
  pull(celltype) %>%
  table()

.
alpha  beta delta 
 6218  7598   710 

In [9]:
# Save the joined promoter / pseudotime table to disk.
fwrite(
  x = dat.pro_ps,
  file = "../dat/1910_v2/abd.promoter.long_matrix_w_transcripts_ps.txt"
)