# Load datasets

In [1]:
# Human lncRNA GENCODE v30 information (tab-separated, header row)
lncRNA <- read.csv(
  file = './gencode.v30.lncRNA_transcripts_major_compact.txt',
  sep = '\t',
  header = TRUE
) #16153
# Modify the Ensembl gene id of each lncRNA (first column): keep only the
# text in front of the first '.'
lncRNA[1] <- vapply(
  strsplit(as.character(lncRNA[[1]]), split = '.', fixed = TRUE),
  function(id_parts) id_parts[1],
  character(1)
)

# Three raw datasets (all tab-separated, header row)
lncATLAs <- read.csv(
  file = '2021-01-29_lncATLAS_noncoding_data_RCI.tsv',
  sep = '\t',
  header = TRUE
) # 15929 ncRNAs * 15 = 238935
APEX <- read.csv(file = '2019_CELL_APEXSeq.tsv', sep = '\t', header = TRUE) # 3335 RNAs
Cefra <- read.csv(file = '2018_CeFra_Seq_polyA_plus.tsv', sep = '\t', header = TRUE) # 63677 RNAs

# lncATLAs

In [2]:
# Pivot the long lncATLAS table to wide format: one row per gene, one
# `Value.<cell line>` column per cell line.
# Rows containing any NA are dropped first.
lncATLAs <- na.omit(lncATLAs)
# 6768 lncRNAs with at least 1 RCI across 15 cell lines
lncATLAs_reshape <- reshape(
  data = lncATLAs[, c(1, 2, 4)], # selected by position; columns named below must be among them
  direction = 'wide',
  idvar = 'ENSEMBL.ID',
  timevar = 'Data.Source',
  v.names = 'Value'
)
# Promote the first column (the gene id) to row names, then drop it
row.names(lncATLAs_reshape) <- lncATLAs_reshape[[1]]
lncATLAs_reshape <- lncATLAs_reshape[, -1]

In [3]:
# Preview the first rows of the wide per-gene table
head(lncATLAs_reshape)

Unnamed: 0_level_0,Value.MCF.7,Value.A549,Value.GM12878,Value.H1.hESC,Value.HeLa.S3,Value.HepG2,Value.HT1080,Value.HUVEC,Value.IMR.90,Value.K562,Value.NCI.H460,Value.NHEK,Value.SK.MEL.5,Value.SK.N.DZ,Value.SK.N.SH
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000082929,1.23491,,,,,,,,,,,,,,
ENSG00000093100,-2.52324,-3.15798,-2.00351,-2.2182,-2.05661,-2.93963,-2.53315,-2.64088,-3.17702,-1.87124,-1.92943,-2.41763,-1.86131,-1.49035,-3.9866
ENSG00000099869,,,,1.0,,0.00846158,,,,,,,,,
ENSG00000100181,,,,-0.192645,,,,-1.01879,,-0.404775,,0.0238308,-0.161441,,
ENSG00000116652,-4.88753,-1.848,-3.42824,1.65208,,-4.42626,,,,,-4.98868,-2.97982,,,
ENSG00000116883,-4.22716,-3.03986,,-1.84143,,-4.2381,,-4.46698,-4.8799,-2.25852,-3.67007,-4.49076,,,-5.17126


In [2]:
# Pivot the long lncATLAS table to wide format: one row per gene, one
# `Value.<cell line>` column per cell line.
# Rows containing any NA are dropped first.
lncATLAs <- na.omit(lncATLAs)
# 6768 lncRNAs with at least 1 RCI across 15 cell lines
lncATLAs_reshape <- reshape(
  data = lncATLAs[, c(1, 2, 4)], # selected by position; columns named below must be among them
  direction = 'wide',
  idvar = 'ENSEMBL.ID',
  timevar = 'Data.Source',
  v.names = 'Value'
)
# Promote the first column (the gene id) to row names, then drop it
row.names(lncATLAs_reshape) <- lncATLAs_reshape[[1]]
lncATLAs_reshape <- lncATLAs_reshape[, -1]

# Remove the RCI values of the H1 cell line because of their discordance.
# NOTE(review): the column is dropped by position (4th), so this relies on
# the column order produced by reshape() -- confirm if the input changes.
lncATLAs_reshape <- lncATLAs_reshape[, -4]
# Calculate the average RCI across cell lines for each lncRNA
#
# mean_NAomit(x): arithmetic mean of the non-missing entries of `x`.
#   x       numeric vector (one row of the wide RCI table); may contain NA
#   returns a single number; NaN when every entry of `x` is missing, which
#           callers can remove afterwards with na.omit()
#
# Uses the built-in mean() instead of a hand-written accumulate loop.
mean_NAomit <- function(x) {
  mean(x, na.rm = TRUE)
}

# Row-wise mean RCI (NA entries ignored by mean_NAomit), one value per gene
RCI_mean <- as.data.frame(apply(lncATLAs_reshape, MARGIN = 1, FUN = mean_NAomit))
# Carry the gene ids (row names) along as an explicit column
ensembl_gene_id <- rownames(RCI_mean)
RCI_mean <- cbind(ensembl_gene_id, RCI_mean)
names(RCI_mean) <- c('ensembl_gene_id', 'RCI_mean')
# 5760 lncRNAs with at least 1 RCI across 14 cell lines
# (genes without any remaining value have a NaN mean and are dropped here)
RCI_mean <- na.omit(RCI_mean) # 5760


# Mean RCI below -2 -> nuclear set; mean RCI above 0 -> cytoplasmic set.
# No NA is left in RCI_mean$RCI_mean after na.omit(), so plain logical
# indexing selects exactly the matching rows.
lncATLAS_Nuc <- as.character(RCI_mean$ensembl_gene_id[RCI_mean$RCI_mean < -2]) # 1983
lncATLAS_Cyto <- as.character(RCI_mean$ensembl_gene_id[RCI_mean$RCI_mean > 0]) # 1525

In [3]:
# Number of rows (genes) in the wide RCI table
nrow(lncATLAs_reshape)

In [4]:
# Number of genes that kept a mean RCI after na.omit()
nrow(RCI_mean)

In [5]:
# Size of the lncATLAS nuclear set (mean RCI < -2)
length(lncATLAS_Nuc)

In [6]:
# Size of the lncATLAS cytoplasmic set (mean RCI > 0)
length(lncATLAS_Cyto)

# APEX-Seq

In [7]:
# Select human lncRNAs
APEX <- APEX[APEX$Ensembl_Gene %in% lncRNA$ensembl_gene_id, ] #61

# For APEX (as mentioned in the original article, log2 fold change > 0.75 is
# considered as enrichment).
# Collapse the 8 locations into two binary labels:
#   Nuc  = 1 if any of the four nuclear columns exceeds the cutoff, else 0
#   Cyto = 1 if any of the four cytoplasmic columns exceeds the cutoff, else 0
# The test is vectorised over all rows; a missing log2FC counts as "not
# enriched" and an empty APEX table simply yields empty label vectors.
apex_cutoff <- 0.75
apex_nuc_cols <- c('Nucleus_log2FC', 'Nucleolus_log2FC',
                   'Lamina_log2FC', 'Nuclear_Pore_log2FC')
apex_cyto_cols <- c('Cytosol_log2FC', 'ERM_log2FC',
                    'OMM_log2FC', 'ER_Lumen_log2FC')
Nuc <- as.numeric(
  rowSums(as.matrix(APEX[, apex_nuc_cols, drop = FALSE]) > apex_cutoff,
          na.rm = TRUE) > 0
)
Cyto <- as.numeric(
  rowSums(as.matrix(APEX[, apex_cyto_cols, drop = FALSE]) > apex_cutoff,
          na.rm = TRUE) > 0
)
APEX_loc <- cbind(APEX, Nuc, Cyto) # 62

# Keep only unambiguous calls: enriched in exactly one of the two compartments.
# These vectors are required by the union step further down.
APEX_Nuc <- as.character(APEX_loc[which(APEX_loc$Nuc == 1 & APEX_loc$Cyto == 0), 'Ensembl_Gene']) # 42 of 56
APEX_Cyto <- as.character(APEX_loc[which(APEX_loc$Nuc == 0 & APEX_loc$Cyto == 1), 'Ensembl_Gene']) # 5 of 19

In [9]:
# Number of entries in APEX_Nuc
length(APEX_Nuc)

In [10]:
# Number of entries in APEX_Cyto
length(APEX_Cyto)

# Cefra-Seq

In [11]:
# Keep only the rows whose gene_biotype is one of the four listed classes
# 14746 lncRNAs
Cefra <- Cefra[
  Cefra$gene_biotype %in%
    c('antisense', 'sense_intronic', 'lincRNA', 'processed_transcript'),
]

# ExpressedRNA(): expression flag and cytoplasmic fraction for one RNA.
#
# Arguments are the two replicate values (A, B) of each of the four
# fractions: cyto, insol, membr and nucl. Each argument is a single number.
#
# Returns a list with
#   expressed  1 if the replicate mean of at least one fraction is >= 1,
#              otherwise 0
#   CNRCI      for expressed RNAs: m / (m + nucl mean), where m is the largest
#              of the cyto / insol / membr replicate means; 0 otherwise
ExpressedRNA <- function(cyto_A,cyto_B,insol_A,insol_B,membr_A,membr_B,nucl_A,nucl_B){
  cyto_mean <- (cyto_A + cyto_B) / 2
  insol_mean <- (insol_A + insol_B) / 2
  membr_mean <- (membr_A + membr_B) / 2
  nucl_mean <- (nucl_A + nucl_B) / 2

  passes_cutoff <- cyto_mean >= 1 || insol_mean >= 1 ||
    membr_mean >= 1 || nucl_mean >= 1
  if (!passes_cutoff) {
    # Not expressed in any fraction: both outputs are 0
    return(list(expressed = 0, CNRCI = 0))
  }

  top_non_nuclear <- max(cyto_mean, insol_mean, membr_mean)
  list(expressed = 1, CNRCI = top_non_nuclear / (top_non_nuclear + nucl_mean))
}

# Score every CeFra-seq row with ExpressedRNA():
#   expressed[i]  1/0 expression flag of row i
#   CNRCI[i]      cytoplasmic fraction of row i (0 when not expressed)
# The result vectors are preallocated and the loop runs over seq_len(), so an
# empty table is skipped instead of iterating over c(1, 0).
n_cefra <- nrow(Cefra)
expressed <- numeric(n_cefra)
CNRCI <- numeric(n_cefra)
for (i in seq_len(n_cefra)) {
  result <- ExpressedRNA(
    Cefra[i, 'cyto_A'], Cefra[i, 'cyto_B'],
    Cefra[i, 'insol_A'], Cefra[i, 'insol_B'],
    Cefra[i, 'membr_A'], Cefra[i, 'membr_B'],
    Cefra[i, 'nucl_A'], Cefra[i, 'nucl_B']
  )
  expressed[i] <- result$expressed
  CNRCI[i] <- result$CNRCI
}

# Keep the expressed rows and attach their CN_RCI as an extra column
is_high <- expressed == 1
Cefra_high <- Cefra[is_high, ] # 1621 lncRNAs with high expression
CN_RCI_high <- data.frame(CN_RCI = CNRCI[is_high])
Cefra_high <- cbind(Cefra_high, CN_RCI_high)

# CN_RCI below 0.4 -> nuclear set; above 0.6 -> cytoplasmic set.
# which() drops rows whose CN_RCI is NA.
Cefra_Nuc <- as.character(Cefra_high[which(Cefra_high$CN_RCI < 0.4), 'ensembl_gene_id']) # 528
Cefra_Cyto <- as.character(Cefra_high[which(Cefra_high$CN_RCI > 0.6), 'ensembl_gene_id']) # 916

In [12]:
# Number of CeFra-seq rows left after the biotype filter
nrow(Cefra)

In [13]:
# Number of rows flagged as expressed
nrow(Cefra_high)

In [14]:
# Size of the CeFra-seq nuclear set (CN_RCI < 0.4)
length(Cefra_Nuc)

In [15]:
# Size of the CeFra-seq cytoplasmic set (CN_RCI > 0.6)
length(Cefra_Cyto)

# Union and remove bi-localized lncRNAs 

In [16]:
# Union
# Pool the gene ids of the three datasets, separately for the nuclear and
# the cytoplasmic sets (left-to-right fold with union()).
Nuc_Union <- Reduce(union, list(APEX_Nuc, Cefra_Nuc, lncATLAS_Nuc))
Cyto_Union <- Reduce(union, list(APEX_Cyto, Cefra_Cyto, lncATLAS_Cyto))

In [17]:
# Remove bi-localized lncRNAs: keep an id only if it appears in exactly one
# of the two pooled sets. Each result is a one-column data frame.
Nuc_final <- data.frame(ensembl_gene_id = setdiff(Nuc_Union, Cyto_Union))
Cyto_final <- data.frame(ensembl_gene_id = setdiff(Cyto_Union, Nuc_Union))

In [18]:
# Number of nuclear-only lncRNAs
nrow(Nuc_final)

In [19]:
# Number of cytoplasmic-only lncRNAs
nrow(Cyto_final)

# Restrict to lncRNAs present in the GENCODE annotation

In [20]:
# Join each final id list with the annotation table on the gene id.
# merge() keeps only ids that occur in both tables (default inner join).
Nuc_info <- merge(x = lncRNA, y = Nuc_final, by = 'ensembl_gene_id')
Cyto_info <- merge(x = lncRNA, y = Cyto_final, by = 'ensembl_gene_id')

In [21]:
# Number of annotated nuclear rows after the merge
nrow(Nuc_info)

In [22]:
# Number of annotated cytoplasmic rows after the merge
nrow(Cyto_info)

# Output

In [23]:
# Write both annotated tables as CSV, without quoting and without row names
write.csv(
  x = Nuc_info,
  file = 'lncRNA_info_nuc_woRNALocate.csv',
  quote = FALSE,
  row.names = FALSE
)
write.csv(
  x = Cyto_info,
  file = 'lncRNA_info_cyto_woRNALocate.csv',
  quote = FALSE,
  row.names = FALSE
)