## 1. Load and check the data

In [5]:
# Attach rtracklayer for BED import. library() stops with an error if the
# package is missing, whereas require() only warns and returns FALSE, which
# would postpone the failure to the import.bed() call below.
library(rtracklayer)

# Read the GENCODE v19 transcript intervals from the BED file; import.bed()
# returns a GRanges object whose `name` metadata column holds the BED name field.
all.transcript <- import.bed("../dat/1901/gencode.v19.1kb_all_possible_transcripts.bed")

# Display the ranges as a quick sanity check of the import.
all.transcript

GRanges object with 133870 ranges and 1 metadata column:
           seqnames        ranges strand |        name
              <Rle>     <IRanges>  <Rle> | <character>
       [1]     chr1   68592-69591      * |       OR4F5
       [2]     chr1 138880-139879      * |  AL627309.1
       [3]     chr1 367141-368140      * |      OR4F29
       [4]     chr1 621554-622553      * |      OR4F16
       [5]     chr1 738638-739637      * |  AL669831.1
       ...      ...           ...    ... .         ...
  [133866]     chrM    9971-10970      * |     MT-ND4L
  [133867]     chrM   10261-11260      * |      MT-ND4
  [133868]     chrM   11838-12837      * |      MT-ND5
  [133869]     chrM   14174-15173      * |      MT-ND6
  [133870]     chrM   14248-15247      * |      MT-CYB
  -------
  seqinfo: 25 sequences from an unspecified genome; no seqlengths

In [52]:
# Attach data.table (fread/fwrite/foverlaps are used throughout this notebook).
# library() fails loudly when the package is not installed; require() would
# only warn and return FALSE.
library(data.table)

# Long-format promoter overlap matrix for alpha cells: one row per
# (peak, gene, cluster, cell) combination with an `overlap` flag.
overlap.res <- fread("../dat/1901/alpha.promoter.long_matrix.txt")

In [53]:
# Preview the first six rows of the alpha overlap table.
head(overlap.res, n = 6L)

peak,gene,cluster,cell,overlap
1:859052-860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTAGGCAGAAGTAAGGAGCAGGA,1
1:859052-860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTATGCGCAGCGTCTAATGGTTG,1
1:859052-860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTCGAGGCTGAAGGCTATGGTTG,1
1:859052-860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTCGTACTAGCTAAGCCTGTACT,1
1:859052-860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTGGACTCCTTCGACTAGGGTTG,1
1:859052-860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTGGAGCTACAAGGAGTAAGGCG,1


## 2. Data manipulation

The goal is to add a transcript index column (`transcript.idx`) to `overlap.res`. Each transcript has a unique TSS.

- Use the [`foverlaps`](https://www.rdocumentation.org/packages/data.table/versions/1.11.8/topics/foverlaps) function to find the peaks that overlap TSS regions.


In [54]:
# Attach the tidyverse (separate/mutate/select and the %>% pipe). library()
# errors if the package is missing instead of returning FALSE like require().
library(tidyverse)

# Split the "chrom:start-end" peak label into three columns, prefix the
# chromosome with "chr" to match the naming used in the BED files, and store
# the coordinates as integers. Integers (rather than doubles) match the type
# fread() assigns to BED coordinates and keep fwrite() from emitting round
# positions in scientific notation (e.g. 1e+06).
overlap.res <- overlap.res %>%
  separate(peak, into = c("seq", "start", "end"), sep = ":|-") %>%
  mutate(
    seq = paste0("chr", seq),
    start = as.integer(start),
    end = as.integer(end)
  )

# Make sure the result is a proper data.table again after the dplyr/tidyr
# verbs, since foverlaps() is applied to it later.
setDT(overlap.res)
head(overlap.res)

seq,start,end,gene,cluster,cell,overlap
chr1,859052,860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTAGGCAGAAGTAAGGAGCAGGA,1
chr1,859052,860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTATGCGCAGCGTCTAATGGTTG,1
chr1,859052,860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTCGAGGCTGAAGGCTATGGTTG,1
chr1,859052,860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTCGTACTAGCTAAGCCTGTACT,1
chr1,859052,860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTGGACTCCTTCGACTAGGGTTG,1
chr1,859052,860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTGGAGCTACAAGGAGTAAGGCG,1


In [55]:
# Promoter peaks associated with alpha-cell transcripts. The file has no
# header, so the four columns are named explicitly.
transcripts.alpha <- fread(
  "../dat/1901/alpha.transcript_promoter_peaks.bed",
  col.names = c("seq", "start", "end", "gene")
)
#transcripts.alpha$transcript.id <- make.names(transcripts.alpha$transcript.id,unique = T)

# Key the table on (seq, start, end). Setting a key sorts the rows by
# reference; foverlaps() requires its `y` argument to be keyed this way.
setkeyv(transcripts.alpha, c("seq", "start", "end"))

# Inspect the first rows and the table dimensions.
head(transcripts.alpha)
dim(transcripts.alpha)

seq,start,end,gene
chr1,859052,860562,SAMD11
chr1,875573,875966,SAMD11
chr1,876931,878016,SAMD11
chr1,894277,895102,NOC2L
chr1,895801,896103,KLHL17
chr1,901696,902721,PLEKHN1


In [70]:
# For every row of overlap.res, find the first interval in transcripts.alpha
# that overlaps it on the same chromosome. With which = TRUE and
# mult = "first" the result is an integer vector, one entry per row of `x`,
# holding row numbers of the keyed (i.e. sorted) transcripts.alpha table.
# Rows without any overlap receive NA.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
foverlap.res <- foverlaps(
  x = overlap.res,
  y = transcripts.alpha,
  by.x = c("seq", "start", "end"),
  by.y = key(transcripts.alpha),
  type = "any",
  which = TRUE,
  mult = "first"
)

# Sanity check: smallest and largest matched index. The result is NA NA if
# any row failed to match, because NA values are not removed here.
range(foverlap.res)

In [74]:
# Replace the `overlap` flag with the matched promoter-peak index:
# `overlap` is dropped and `transcript.idx` is appended as the last column.
overlap.res <- overlap.res %>%
  select(-overlap) %>%
  mutate(transcript.idx = foverlap.res)
head(overlap.res)

seq,start,end,gene,cluster,cell,transcript.idx
chr1,859052,860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTAGGCAGAAGTAAGGAGCAGGA,1
chr1,859052,860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTATGCGCAGCGTCTAATGGTTG,1
chr1,859052,860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTCGAGGCTGAAGGCTATGGTTG,1
chr1,859052,860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTCGTACTAGCTAAGCCTGTACT,1
chr1,859052,860562,SAMD11,alpha_1,Islet1-fresh_AGACACCTGGACTCCTTCGACTAGGGTTG,1
chr1,859052,860562,SAMD11,alpha_2,Islet1-fresh_AGACACCTGGAGCTACAAGGAGTAAGGCG,1


### 3. Repeat for beta

In [77]:
# Beta cells: load the long-format promoter overlap matrix and split the
# "chrom:start-end" peak label into seq/start/end. Coordinates are stored as
# integers so they match the type fread() gives the BED coordinates below and
# are never written out in scientific notation by fwrite().
overlap.res.beta <- fread("../dat/1901/beta.promoter.long_matrix.txt") %>%
  separate(peak, into = c("seq", "start", "end"), sep = ":|-") %>%
  mutate(
    seq = paste0("chr", seq),
    start = as.integer(start),
    end = as.integer(end)
  )
setDT(overlap.res.beta)

# Promoter peaks associated with beta-cell transcripts, keyed (and therefore
# sorted) on the interval columns as foverlaps() requires for `y`.
transcripts.beta <- fread(
  "../dat/1901/beta.transcript_promoter_peaks.bed",
  col.names = c("seq", "start", "end", "gene")
)
setkey(transcripts.beta, seq, start, end)

# Attach, for every row, the row number of the first overlapping interval in
# the keyed transcripts.beta table (NA when nothing overlaps), then drop the
# `overlap` flag. TRUE is spelled out because `T` can be reassigned.
overlap.res.beta <- overlap.res.beta %>%
  mutate(
    transcript.idx = foverlaps(
      x = overlap.res.beta,
      y = transcripts.beta,
      by.x = c("seq", "start", "end"),
      type = "any",
      which = TRUE,
      mult = "first"
    )
  ) %>%
  select(-overlap)

# Preview the annotated beta table.
head(overlap.res.beta)

seq,start,end,gene,cluster,cell,transcript.idx
chr1,859052,860562,SAMD11,beta_1,Islet1-fresh_AGACACCTACTCGCTATCGACTAGATAGA,1
chr1,859052,860562,SAMD11,beta_2,Islet1-fresh_AGACACCTACTCGCTATTCTAGCTGGTTG,1
chr1,859052,860562,SAMD11,beta_1,Islet1-fresh_AGACACCTAGGCAGAATCTCTCCGAGGCG,1
chr1,859052,860562,SAMD11,beta_1,Islet1-fresh_AGACACCTCGTACTAGAAGGAGTACAGGA,1
chr1,859052,860562,SAMD11,beta_1,Islet1-fresh_AGACACCTCTCTCTACACTGCATAAGGCG,1
chr1,859052,860562,SAMD11,beta_2,Islet1-fresh_AGACACCTCTCTCTACTATCCTCTTATAG,1


In [78]:
# Sanity check: the matched indices should fall within the number of rows of
# transcripts.beta (first element of dim()). range() yields NA NA if any row
# is unmatched.
range(overlap.res.beta[["transcript.idx"]])
dim(transcripts.beta)

### 4. Combine alpha and beta 

In [80]:
# Stack the alpha rows on top of the beta rows.
# NOTE(review): transcript.idx was computed against transcripts.alpha for the
# alpha rows and against transcripts.beta for the beta rows, so after this
# step an index is only meaningful together with the row's cell type.
overlap.res <- rbind(
  overlap.res,
  overlap.res.beta
)

In [5]:
# Size of the combined table and the number of distinct cell barcodes in it.
dim(overlap.res)
length(unique(overlap.res[["cell"]]))

### 5. Filter using the UMAP results

In [11]:
# Per-cell UMAP table; its `barcodes` column is used below to decide which
# cells to keep.
dat.all.cells <- fread(file = "../dat/output.umap.ab.filtered.csv")
head(dat.all.cells, n = 6L)

# Number of barcodes listed in the UMAP table.
length(dat.all.cells[["barcodes"]])

barcodes,UMAP1,UMAP2,cluster,cell_type_overall,subtype,log10_n_counts,log10_n_peaks,Islet1,Islet2,Islet3
Islet3-fresh_CTGAAGCTTGCAGCTACTCTCTATTATAGCCT,6.029433,-3.42981782,beta_2,beta,2,4.520863,4.152013,0,0,1
Islet3-fresh_TCCGCGAACGAGGCTGCCTAGAGTGTACTGAC,-10.054582,-0.77572841,alpha_1,alpha,1,4.095657,3.758761,0,0,1
Islet3-fresh_CGGCTATGAAGAGGCAGTAAGGAGCCTATCCT,6.807724,-4.28113995,beta_1,beta,1,3.953856,3.622214,0,0,1
Islet3-fresh_CGGCTATGGCGTAGTAAAGGAGTATATAGCCT,-10.170747,-2.35448799,alpha_1,alpha,1,4.025879,3.694781,0,0,1
Islet3-fresh_CTGAAGCTTAGGCATGCCTAGAGTGGTTGCGT,-11.246461,0.03809572,alpha_1,alpha,1,4.470131,4.087994,0,0,1
Islet3-fresh_GAGATTCCCGAGGCTGAAGGCTATAGGCGAAG,-11.169406,-0.60920586,alpha_1,alpha,1,4.115577,3.780821,0,0,1


In [10]:
# Reload the combined alpha/beta table from disk, replacing any in-memory
# overlap.res.
# NOTE(review): the only visible write to this path is the fwrite() at the
# end of this notebook, which stores the *filtered* table. Confirm that the
# unfiltered combined table was saved here before this cell is first run.
overlap.res <- fread(
  file = "../dat/1901/alpha_beta.promoter.long_matrix_w_transcripts.txt"
)

In [12]:
# Keep only rows whose cell barcode is present in the UMAP table.
overlap.res <- dplyr::filter(
  overlap.res,
  cell %in% dat.all.cells[["barcodes"]]
)

# Size of the filtered table and the number of distinct cells that remain.
dim(overlap.res)
length(unique(overlap.res[["cell"]]))

In [13]:
# Persist the filtered table. This overwrites the same file that was read
# when overlap.res was reloaded above.
fwrite(
  x = overlap.res,
  file = "../dat/1901/alpha_beta.promoter.long_matrix_w_transcripts.txt"
)