## 1. Data manipulation

The goal is to add a `transcript` column to the overlap results (`overlap.res`). Each `transcript` has a unique `tss`. 

- Use the [`foverlaps`](https://www.rdocumentation.org/packages/data.table/versions/1.11.8/topics/foverlaps) function to find each `peak` that overlaps a `TSS` region.  

### 1.1 load all transcripts

In [2]:
# Load the transcript intervals and key them for interval-overlap queries.
# library() is used instead of require(): it stops with an error when the
# package is missing rather than returning FALSE and failing later.
library(data.table)
#require(rtracklayer)
#all.transcript <- import.bed('../dat/1901/gencode.v19.1kb_all_possible_transcripts.bed')
all.transcript <- fread(
  '../dat/1901/gencode.v19.1kb_all_possible_transcripts.bed',
  col.names = c("seq", "start", "end", "gene")
)
# setkey() sorts the table by (seq, start, end) in place, so row numbers from
# here on refer to the sorted table, not to the line order of the BED file.
setkey(all.transcript, seq, start, end) # this step changed order
# Quick look at both ends of the sorted table and its dimensions.
head(all.transcript)
tail(all.transcript)
dim(all.transcript)

seq,start,end,gene
chr1,68591,69591,OR4F5
chr1,138879,139879,AL627309.1
chr1,367140,368140,OR4F29
chr1,621553,622553,OR4F16
chr1,738637,739637,AL669831.1
chr1,817543,818543,AL645608.2


seq,start,end,gene
chrY,59213544,59214544,VAMP7
chrY,59213547,59214547,VAMP7
chrY,59329752,59330752,IL9R
chrY,59329877,59330877,IL9R
chrY,59329905,59330905,IL9R
chrY,59338285,59339285,IL9R


In [3]:
# Spot-check two rows of the keyed transcript table by row number.
all.transcript[133453L, ]
all.transcript[42733L, ]

seq,start,end,gene
chrX,153718453,153719453,SLC10A3


seq,start,end,gene
chr15,67546492,67547492,AAGAB


In [5]:
# Persist the keyed (re-sorted) transcript table so that row indices computed
# against it can be resolved later.
saveRDS(
  object = all.transcript,
  file = '../dat/1901/gencode.v19.1kb_all_possible_transcripts.Rds'
)

### Row order differs from the BED file (rows are re-sorted by `seq`, `start`, `end`) !!!
- this is caused by `setkey`, not by `fread`

## 2. Alpha cells

In [3]:
# Read the alpha-cell promoter matrix (long format) and split the `peak`
# string into separate seq / start / end columns.
# library() is used instead of require(): it stops with an error when the
# package is missing rather than returning FALSE and failing later.
library(tidyverse)
overlap.res <- fread('../dat/1901/alpha.promoter.long_matrix.txt') %>%
  separate(peak, into = c("seq", "start", "end"), sep = ":|-") %>%
  mutate(seq = paste0('chr', seq)) # prefix so names look like "chr1"
setDT(overlap.res)
# The split coordinates are converted to numeric so they can serve as
# interval bounds in the overlap step.
overlap.res <- overlap.res[, c("start", "end") := lapply(.SD, as.numeric), .SDcols = c("start", "end")]
head(overlap.res)
dim(overlap.res)

seq,start,end,gene,cluster,cell,overlap
chr1,859052,860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTAGGCAGAAGTAAGGAGCAGGA,1
chr1,859052,860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTATGCGCAGCGTCTAATGGTTG,1
chr1,859052,860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTCGAGGCTGAAGGCTATGGTTG,1
chr1,859052,860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTCGTACTAGCTAAGCCTGTACT,1
chr1,859052,860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTGGACTCCTTCGACTAGGGTTG,1
chr1,859052,860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTGGAGCTACAAGGAGTAAGGCG,1


In [4]:
# For every peak row, look up the row number in `all.transcript` of the first
# transcript interval it overlaps (any kind of overlap counts).
foverlap.res <- foverlaps(
  x = overlap.res %>% select(-gene),
  y = all.transcript,
  by.x = c("seq", "start", "end"),
  type = "any",
  which = TRUE,  # spelled out: `T` is an ordinary variable and can be reassigned
  mult = "first" # only choose 1st overlap
)
# Sanity check: smallest and largest matched row index.
range(foverlap.res)

In [5]:
# Store the matched row index as `transcript.idx` and drop the `overlap`
# column.
overlap.res <- overlap.res %>%
  mutate(transcript.idx = foverlap.res) %>%
  select(-overlap)
head(overlap.res)
dim(overlap.res)

seq,start,end,gene,cluster,cell,transcript.idx
chr1,859052,860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTAGGCAGAAGTAAGGAGCAGGA,7
chr1,859052,860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTATGCGCAGCGTCTAATGGTTG,7
chr1,859052,860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTCGAGGCTGAAGGCTATGGTTG,7
chr1,859052,860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTCGTACTAGCTAAGCCTGTACT,7
chr1,859052,860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTGGACTCCTTCGACTAGGGTTG,7
chr1,859052,860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTGGAGCTACAAGGAGTAAGGCG,7


## 3. Repeated for beta

In [6]:
# Read the beta-cell promoter matrix (long format) and split the `peak`
# string into separate seq / start / end columns.
overlap.res.beta <- fread('../dat/1901/beta.promoter.long_matrix.txt') %>%
  separate(peak, into = c("seq", "start", "end"), sep = ":|-") %>%
  mutate(seq = paste0('chr', seq)) # prefix so names look like "chr1"
setDT(overlap.res.beta)
# The split coordinates are converted to numeric so they can serve as
# interval bounds in the overlap step.
overlap.res.beta <- overlap.res.beta[, c("start", "end") := lapply(.SD, as.numeric), .SDcols = c("start", "end")]

# For every peak row, store the row number in `all.transcript` of the first
# overlapping transcript interval as `transcript.idx`, then drop `overlap`.
overlap.res.beta <- overlap.res.beta %>%
  mutate(
    transcript.idx = foverlaps(
      x = overlap.res.beta %>% select(-gene),
      y = all.transcript,
      by.x = c("seq", "start", "end"),
      type = "any",
      which = TRUE,  # spelled out: `T` is an ordinary variable and can be reassigned
      mult = "first" # only choose 1st overlap
    )
  ) %>%
  select(-overlap)

##
head(overlap.res.beta)
dim(overlap.res.beta)

seq,start,end,gene,cluster,cell,transcript.idx
chr1,859052,860562,SAMD11,beta_1,Islet1-fresh_AGACACCTACTCGCTATCGACTAGATAGA,7
chr1,859052,860562,SAMD11,beta_2,Islet1-fresh_AGACACCTACTCGCTATTCTAGCTGGTTG,7
chr1,859052,860562,SAMD11,beta_1,Islet1-fresh_AGACACCTAGGCAGAATCTCTCCGAGGCG,7
chr1,859052,860562,SAMD11,beta_1,Islet1-fresh_AGACACCTCGTACTAGAAGGAGTACAGGA,7
chr1,859052,860562,SAMD11,beta_1,Islet1-fresh_AGACACCTCTCTCTACACTGCATAAGGCG,7
chr1,859052,860562,SAMD11,beta_2,Islet1-fresh_AGACACCTCTCTCTACTATCCTCTTATAG,7


In [7]:
# Compare the span of matched row indices with the size of the transcript
# table they index into.
range(overlap.res.beta[["transcript.idx"]])
dim(all.transcript)

## 4. Combine alpha and beta 

In [8]:
# Stack the alpha rows and the beta rows into a single table.
overlap.res <- rbind(overlap.res, overlap.res.beta)

In [9]:
# Size of the combined table and number of distinct cells in it.
dim(overlap.res)
length(unique(overlap.res[["cell"]]))

## 5. Filtering using the UMAP results

In [10]:
# Load the filtered UMAP output; its `barcodes` column is used below to
# select which cells to keep.
dat.all.cells <- fread('../dat/output.umap.ab.filtered.csv')
head(dat.all.cells)
length(dat.all.cells[["barcodes"]])

barcodes,UMAP1,UMAP2,cluster,cell_type_overall,subtype,log10_n_counts,log10_n_peaks,Islet1,Islet2,Islet3
Islet3-fresh_CTGAAGCTTGCAGCTACTCTCTATTATAGCCT,6.029433,-3.42981782,beta_2,beta,2,4.520863,4.152013,0,0,1
Islet3-fresh_TCCGCGAACGAGGCTGCCTAGAGTGTACTGAC,-10.054582,-0.77572841,alpha_1,alpha,1,4.095657,3.758761,0,0,1
Islet3-fresh_CGGCTATGAAGAGGCAGTAAGGAGCCTATCCT,6.807724,-4.28113995,beta_1,beta,1,3.953856,3.622214,0,0,1
Islet3-fresh_CGGCTATGGCGTAGTAAAGGAGTATATAGCCT,-10.170747,-2.35448799,alpha_1,alpha,1,4.025879,3.694781,0,0,1
Islet3-fresh_CTGAAGCTTAGGCATGCCTAGAGTGGTTGCGT,-11.246461,0.03809572,alpha_1,alpha,1,4.470131,4.087994,0,0,1
Islet3-fresh_GAGATTCCCGAGGCTGAAGGCTATAGGCGAAG,-11.169406,-0.60920586,alpha_1,alpha,1,4.115577,3.780821,0,0,1


In [11]:
# Keep only rows whose cell barcode appears in the filtered UMAP table, then
# report the remaining table size and number of distinct cells.
overlap.res <- overlap.res %>%
  filter(cell %in% dat.all.cells$barcodes)
dim(overlap.res)
length(unique(overlap.res[["cell"]]))

##  save

In [12]:
# Write the combined, filtered table (including `transcript.idx`) to disk.
fwrite(
  overlap.res,
  '../dat/1901/alpha_beta.promoter.long_matrix_w_transcripts_corrected.txt'
)

### check idx

`(base) ➜  1901 git:(master) ✗ grep ABHD14A alpha_beta.promoter.long_matrix_w_transcripts_corrected.txt |awk -v FS=',' '{print $1,$2,$3,$4,$7}'| sort|uniq`

chr3 52007627 52009641 ABHD14A 92628

chr3 52007627 52009641 ABHD14A-ACY1 92628

`(base) ➜  1901 git:(master) ✗ grep ABHD14A alpha_beta.promoter.long_matrix_w_transcripts.txt |awk -v FS=',' '{print $1,$2,$3,$4,$7}'| sort|uniq`

chr3 52007627 52009641 ABHD14A 92628

chr3 52007627 52009641 ABHD14A-ACY1 92628

In [13]:
# Resolve the index seen in the grep output above back to its transcript row.
all.transcript[92628L, ]

seq,start,end,gene
chr3,52007200,52008200,ABHD14A
