## 1. Data manipulate

The goal is to add a `transcript` column into overlap res. Each `transcript` has a unique `tss`. 

- use [foverlap](https://www.rdocumentation.org/packages/data.table/versions/1.11.8/topics/foverlaps) function to get `peak` overlapping with `TSS` regions.  

### 1.1 load all transcripts

In [3]:
require(data.table)
require(tidyverse)
#require(rtracklayer)

all.transcript <- fread('../dat/1910_v2/gencode.v19.1kb_all_possible_transcripts.bed',
                       col.names=c("seq",'start','end','gene'))
setkey(all.transcript,seq,start,end) # this step changed order 
all.transcript%>%head(1)
all.transcript%>%tail(1)
dim(all.transcript)

seq,start,end,gene
<chr>,<int>,<int>,<chr>
chr1,68591,69591,OR4F5


seq,start,end,gene
<chr>,<int>,<int>,<chr>
chrY,59329877,59330877,IL9R


In [10]:
system('tail -n1 ../dat/1910_v2/gencode.v19.1kb_all_possible_transcripts.bed',intern = T)%>%cat

chrM	14247	15247	MT-CYB

**fread changed order of rows (reordered based on seq) !!!**
- because set key 

In [4]:
saveRDS(all.transcript,'../dat/1910_v2/gencode.v19.1kb_all_possible_transcripts.Rds')

In [58]:
all.transcript%>% rowid_to_colum( 

seq,start,end,gene
<chr>,<int>,<int>,<chr>
chr1,68591,69591,OR4F5
chr1,138879,139879,AL627309.1
chr1,367140,368140,OR4F29
chr1,621553,622553,OR4F16
chr1,738637,739637,AL669831.1
chr1,817543,818543,AL645608.2


### 1.2 foverlap peaks/reads with TSS 

In [12]:
overlap.res <- fread('gzcat ../dat/1910_v2/islet.lf_mtx.gz')%>%
    separate(peak,into = c("seq","start","end"),sep = ":|-")%>%
    mutate(seq=paste0('chr',seq))
setDT(overlap.res)
overlap.res <-overlap.res[,c("start","end"):=lapply(.SD, as.integer), .SDcols = c("start","end")]
head(overlap.res,1)
dim(overlap.res)

seq,start,end,cell,value
<chr>,<dbl>,<dbl>,<chr>,<int>
chr10,100005281,100005858,Islet1fresh_AGACACCTCGATCAGTACTGCATAGGCTC,2


In [43]:
foverlap.res <- data.table::foverlaps(x = overlap.res,y = all.transcript,
                 type="any", which=T,mult="first") #only choose 1st overlap
range(foverlap.res,na.rm = T)

In [46]:
sum(!is.na(foverlap.res))
sum(is.na(foverlap.res))
sum(!is.na(foverlap.res))+sum(is.na(foverlap.res)) 

In [47]:
overlap.res <- overlap.res%>%mutate(transcript.idx=foverlap.res)
head(overlap.res,1)
dim(overlap.res)

seq,start,end,cell,value,transcript.idx
<chr>,<int>,<int>,<chr>,<int>,<int>
chr1,752578,752778,Islet1fresh_CTGAAGCTTAAGGCGATCGACTAGGTACT,2,


## 5. filtering using umap's results

In [49]:
dat.all.cells <- fread('../dat/1910_v2/islet.cluster_labels.filt.txt')
head(dat.all.cells,1)
length(dat.all.cells$index)

index,UMAP1,UMAP2,cluster_name,unique_usable_reads,log_usable_counts,frac_duplicated_reads,frac_mito_reads,frac_promoters_used,frac_reads_in_peaks,frac_reads_in_promoters
<chr>,<dbl>,<dbl>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Islet1fresh_AGACACCTAAGAGGCAAAGGAGTAGGCTC,-5.102393,1.38682,alpha_2,6254,7.865955,0.7902986,0.03428042,0.04337516,0.5978574,0.2646306


In [50]:
overlap.res <- overlap.res%>% filter(cell %in% dat.all.cells$index)
dim(overlap.res)
length(unique(overlap.res$cell))

##  save

In [59]:
fwrite(overlap.res,'../dat/1910_v2/long_matrix_w_transcripts.txt')

### check idx

`(base) ➜  1901 git:(master) ✗ grep ABHD14A alpha_beta.promoter.long_matrix_w_transcripts_corrected.txt |awk -v FS=',' '{print $1,$2,$3,$4,$7}'| sort|uniq`

chr3 52007627 52009641 ABHD14A 92628

chr3 52007627 52009641 ABHD14A-ACY1 92628

`(base) ➜  1901 git:(master) ✗ grep ABHD14A alpha_beta.promoter.long_matrix_w_transcripts.txt |awk -v FS=',' '{print $1,$2,$3,$4,$7}'| sort|uniq`

chr3 52007627 52009641 ABHD14A 92628

chr3 52007627 52009641 ABHD14A-ACY1 92628

In [51]:
all.transcript[81744,]

seq,start,end,gene
<chr>,<int>,<int>,<chr>
chrY,22737111,22738111,EIF1AY


In [56]:
system('sed -n 81744p ../dat/1910_v2/gencode.v19.1kb_all_possible_transcripts.bed',intern = T)%>%cat

chrY	23672758	23673758	RBMY1A1