## 1. Data manipulation

The goal is to add a `transcript` column to the overlap results (`overlap.res`). Each `transcript` has a unique `tss`.

- Use the [foverlaps](https://www.rdocumentation.org/packages/data.table/versions/1.11.8/topics/foverlaps) function to find each `peak` that overlaps a `TSS` region.  

### 1.1 load all transcripts

In [3]:
# library() stops with an error when a package is missing; require() would
# only return FALSE and let later cells fail with less obvious errors.
library(data.table)
library(tidyverse)

In [3]:

# require(rtracklayer)

# Read the transcript intervals; the four columns are named seq/start/end/gene.
all.transcript <- fread(
  "../dat/1910_v2/gencode.v19.1kb_all_possible_transcripts.bed",
  col.names = c("seq", "start", "end", "gene")
)
# Keying sorts the table in place, so the row order no longer matches the file.
setkeyv(all.transcript, c("seq", "start", "end"))
head(all.transcript, 1)
tail(all.transcript, 1)
dim(all.transcript)

seq,start,end,gene
<chr>,<int>,<int>,<chr>
chr1,68591,69591,OR4F5


seq,start,end,gene
<chr>,<int>,<int>,<chr>
chrY,59329877,59330877,IL9R


In [4]:
# Print the last line of the raw BED file for comparison with tail(all.transcript).
cat(system(
  "tail -n1 ../dat/1910_v2/gencode.v19.1kb_all_possible_transcripts.bed",
  intern = TRUE
))

chrM	14247	15247	MT-CYB

**The row order differs from the input file (rows are now sorted by `seq`, `start`, `end`)!**
- This is caused by `setkey()`, not by `fread()` itself.

In [4]:
# Cache the transcript table as an RDS file.
saveRDS(
  object = all.transcript,
  file = "../dat/1910_v2/gencode.v19.1kb_all_possible_transcripts.Rds"
)

In [3]:
# Reload the cached transcript table.
all.transcript <- readRDS(
  file = "../dat/1910_v2/gencode.v19.1kb_all_possible_transcripts.Rds"
)

In [5]:
# First and last rows of the transcript table.
head(all.transcript, 1)
tail(all.transcript, 1)

seq,start,end,gene
<chr>,<int>,<int>,<chr>
chr1,68591,69591,OR4F5


seq,start,end,gene
<chr>,<int>,<int>,<chr>
chrY,59329877,59330877,IL9R


In [6]:
# Show the last row together with its row number (tr.idx).
tail(rowid_to_column(all.transcript, "tr.idx"), 1)

tr.idx,seq,start,end,gene
<int>,<chr>,<int>,<int>,<chr>
81814,chrY,59329877,59330877,IL9R


### 1.2 foverlap peaks/reads with TSS 

In [7]:
# Load the long-format peak/cell table. `cmd =` states explicitly that the
# string is a shell command instead of relying on fread() guessing that from
# the embedded space.
overlap.res <- fread(cmd = "gzcat ../dat/1910_v2/islet.lf_mtx.gz") %>%
  # split "seq:start-end" peak ids into three columns
  separate(peak, into = c("seq", "start", "end"), sep = ":|-") %>%
  # add the "chr" prefix so seq uses the same naming as all.transcript
  mutate(seq = paste0("chr", seq))
setDT(overlap.res)
# separate() leaves start/end as character; convert them to integer to match
# the integer start/end columns of all.transcript. `:=` updates by reference,
# so the result does not need to be assigned back.
overlap.res[, c("start", "end") := lapply(.SD, as.integer), .SDcols = c("start", "end")]
head(overlap.res, 1)
dim(overlap.res)

seq,start,end,cell,value
<chr>,<int>,<int>,<chr>,<int>
chr10,100005281,100005858,Islet1fresh_AGACACCTCGATCAGTACTGCATAGGCTC,2


#### `foverlaps` returns only the first matching transcript; get distal vs. promoter peaks

In [13]:
# For every row of overlap.res, look up the row index in all.transcript of the
# first overlapping interval (NA when nothing overlaps).
foverlap.res <- data.table::foverlaps(
  x = overlap.res,
  y = all.transcript,
  type = "any",
  mult = "first", # keep only the first overlap
  which = TRUE
)
range(foverlap.res, na.rm = TRUE)

In [10]:
# Number of entries with a match, without a match, and the overall total.
sum(!is.na(foverlap.res))
sum(is.na(foverlap.res))
length(is.na(foverlap.res)) # matched + unmatched

In [16]:
# Preview: attach the matched transcript row to each peak row and show only
# the rows that have a match.
overlap.res %>%
  mutate(tr.idx = foverlap.res) %>%
  filter(!is.na(tr.idx)) %>%
  left_join(rowid_to_column(all.transcript, "tr.idx"), by = "tr.idx") %>%
  head()

seq.x,start.x,end.x,cell,value,tr.idx,seq.y,start.y,end.y,gene
chr10,100027284,100028604,Islet1fresh_AGACACCTAAGAGGCAGCGTAAGAAGGCG,2,9209,chr10,100027507,100028507,LOXL4
chr10,100027284,100028604,Islet1fresh_AGACACCTACTCGCTAGTAAGGAGCAGGA,2,9209,chr10,100027507,100028507,LOXL4
chr10,100027284,100028604,Islet1fresh_AGACACCTACTGAGCGATCTGAGTGTACT,2,9209,chr10,100027507,100028507,LOXL4
chr10,100027284,100028604,Islet1fresh_AGACACCTATCTCAGGTATCCTCTCCTAT,2,9209,chr10,100027507,100028507,LOXL4
chr10,100027284,100028604,Islet1fresh_AGACACCTATGCGCAGAAGGAGTAGTACT,1,9209,chr10,100027507,100028507,LOXL4
chr10,100027284,100028604,Islet1fresh_AGACACCTCGAGGCTGCTCTCTATGGTTG,2,9209,chr10,100027507,100028507,LOXL4


In [18]:
# Annotate every peak row with the index (tr.idx) and gene of its first
# overlapping transcript; both stay NA when there is no overlap.
overlap.res <- left_join(
  mutate(overlap.res, tr.idx = foverlap.res),
  select(rowid_to_column(all.transcript, "tr.idx"), tr.idx, gene),
  by = "tr.idx"
)
overlap.res %>% head(1)
overlap.res %>% dim()

seq,start,end,cell,value,tr.idx,gene
chr10,100005281,100005858,Islet1fresh_AGACACCTCGATCAGTACTGCATAGGCTC,2,,


#### `foverlaps` returns all matching transcripts; build the prom_peak-to-transcript dictionary

In [14]:
# `mult` is left at its default ("all"), so every overlap is returned as an
# xid/yid pair: xid indexes rows of overlap.res, yid rows of all.transcript.
# yid is NA for rows of overlap.res without any overlap.
foverlap.res <- data.table::foverlaps(
  x = overlap.res, y = all.transcript,
  type = "any", which = TRUE
)
foverlap.res %>% head(5)
range(foverlap.res$yid, na.rm = TRUE)
# matched pairs, unmatched rows, and their total
sum(!is.na(foverlap.res$yid))
sum(is.na(foverlap.res$yid))
sum(!is.na(foverlap.res$yid)) + sum(is.na(foverlap.res$yid))

xid,yid
<int>,<int>
1,
2,
3,
4,
5,


In [15]:
# First row of the peak table.
head(overlap.res, 1)

seq,start,end,cell,value
<chr>,<int>,<int>,<chr>,<int>
chr10,100005281,100005858,Islet1fresh_AGACACCTCGATCAGTACTGCATAGGCTC,2


In [16]:
# Build the peak -> transcript table from the all-overlaps result.
# The join keys are spelled out: a natural join would silently also key on
# `gene` whenever overlap.res already carries the first-match annotation
# (tr.idx / gene), which would attach the wrong gene to later matches.
peak_tr.dic <- overlap.res %>%
  # keep only the original peak columns, dropping any earlier annotation
  select(seq, start, end, cell, value) %>%
  rowid_to_column("xid") %>%
  left_join(foverlap.res, by = "xid") %>%
  left_join(
    all.transcript %>% rowid_to_column("yid") %>% select(yid, gene),
    by = "yid"
  )

Joining, by = "xid"
Joining, by = "yid"


In [17]:
# First row and dimensions of the joined table.
head(peak_tr.dic, 1)
dim(peak_tr.dic)

xid,seq,start,end,cell,value,yid,gene
<int>,<chr>,<int>,<int>,<chr>,<int>,<int>,<chr>
1,chr10,100005281,100005858,Islet1fresh_AGACACCTCGATCAGTACTGCATAGGCTC,2,,


In [21]:
# Keep matched rows only, de-duplicate on (seq, start, end, yid, gene), and
# write the peak as "seq:start-end" with the "chr" prefix removed.
peak_tr.dic <- peak_tr.dic %>%
  select(seq, start, end, yid, gene) %>%
  filter(!is.na(yid)) %>%
  distinct() %>%
  mutate(
    seq = sub("chr", "", seq),
    prom_peak = paste0(seq, ":", start, "-", end)
  ) %>%
  select(prom_peak, gene)
head(peak_tr.dic, 1)
dim(peak_tr.dic)
# number of distinct peaks
length(unique(peak_tr.dic$prom_peak))

prom_peak,gene
<chr>,<chr>
10:100027284-100028604,LOXL4


In [5]:
# Drop duplicate rows.
peak_tr.dic <- distinct(peak_tr.dic)
dim(peak_tr.dic)

In [6]:
# fwrite(peak_tr.dic, '../dat/1910_v2/peak_tr.dic.txt')
# Store the peak -> gene table as an RDS file.
saveRDS(object = peak_tr.dic, file = "../dat/1910_v2/peak_tr.dic.rds")

## 5. Filtering using the UMAP results

In [49]:
# Load the per-cell cluster-label table; its `index` column is compared
# against overlap.res$cell in the filtering step.
dat.all.cells <- fread(file = "../dat/1910_v2/islet.cluster_labels.filt.txt")
dat.all.cells %>% head(1)
nrow(dat.all.cells)

index,UMAP1,UMAP2,cluster_name,unique_usable_reads,log_usable_counts,frac_duplicated_reads,frac_mito_reads,frac_promoters_used,frac_reads_in_peaks,frac_reads_in_promoters
<chr>,<dbl>,<dbl>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Islet1fresh_AGACACCTAAGAGGCAAAGGAGTAGGCTC,-5.102393,1.38682,alpha_2,6254,7.865955,0.7902986,0.03428042,0.04337516,0.5978574,0.2646306


In [50]:
# Keep only rows whose cell appears in the cluster-label table.
overlap.res <- filter(overlap.res, cell %in% dat.all.cells$index)
dim(overlap.res)
# number of distinct cells that remain
n_distinct(overlap.res$cell)

##  save

In [19]:
# Write the peak table to disk.
fwrite(x = overlap.res, file = "../dat/1910_v2/long_matrix_w_transcripts.txt")

### check idx

`(base) ➜  1901 git:(master) ✗ grep ABHD14A alpha_beta.promoter.long_matrix_w_transcripts_corrected.txt |awk -v FS=',' '{print $1,$2,$3,$4,$7}'| sort|uniq`

chr3 52007627 52009641 ABHD14A 92628

chr3 52007627 52009641 ABHD14A-ACY1 92628

`(base) ➜  1901 git:(master) ✗ grep ABHD14A alpha_beta.promoter.long_matrix_w_transcripts.txt |awk -v FS=',' '{print $1,$2,$3,$4,$7}'| sort|uniq`

chr3 52007627 52009641 ABHD14A 92628

chr3 52007627 52009641 ABHD14A-ACY1 92628

In [51]:
# Row 81744 of the in-memory table, to compare with line 81744 of the BED file.
all.transcript[81744L, ]

seq,start,end,gene
<chr>,<int>,<int>,<chr>
chrY,22737111,22738111,EIF1AY


In [56]:
# Line 81744 of the raw BED file.
cat(system(
  "sed -n 81744p ../dat/1910_v2/gencode.v19.1kb_all_possible_transcripts.bed",
  intern = TRUE
))

chrY	23672758	23673758	RBMY1A1