# <u>Data Processing</u>

In [51]:
library(PharmacoGx, quietly=TRUE)
library(Biobase, quietly=TRUE)
library(calibrate, quietly=TRUE)
library(forestplot, quietly=TRUE)
library(survcomp, quietly=TRUE)
library(DESeq2, quietly=TRUE)
library(dplyr, quietly=TRUE)
library(stringr, quietly=TRUE)
library(gimme, quietly=TRUE)

In [52]:
options(stringsAsFactors = FALSE)

# Match dataset-specific cell identifiers to the curated identifier stored in
# an annotation table (by default the `unique.cellid` column from PharmacoGx).
#
# ids          character vector of identifiers to look up
# tbl          annotation data.frame
# column       name of the column in `tbl` that is searched; an entry may hold
#              several synonyms separated by "///"
# returnColumn name of the column in `tbl` whose value is returned
#
# Returns a character vector as long as `ids` and named by `ids`; identifiers
# without a match yield NA. Stops when an identifier matches several rows.
matchToIDTable <- function(ids,tbl, column, returnColumn="unique.cellid") {
  candidates <- tbl[, column]
  # vapply keeps the result a character vector even when `ids` is empty
  vapply(ids, function(x) {
    # Escape regex metacharacters so the id is matched literally; done in base R
    # so the function does not need the Hmisc package to be installed.
    escaped <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", x)
    # The id must be a whole synonym: bounded by "///" or the string edges
    myx <- grep(paste0("((///)|^)", escaped, "((///)|$)"), candidates)
    if (length(myx) > 1) {
      stop("Something went wrong in curating ids, we have multiple matches for id: ", x)
    }
    if (length(myx) == 0) {
      return(NA_character_)
    }
    as.character(tbl[myx, returnColumn])
  }, character(1))
}

In [53]:
# Build the two-column matrix of unique pairings between the values of `x`
# and `y` (used here to enumerate every gene-drug combination).
#
# For the k-th unique value of `x`, the partners are the unique values of `y`
# that are not among the earlier values of `x`: the first k values when
# include.equals is FALSE, the first k - 1 when it is TRUE.
# Returns NULL when no pairing exists.
expand.grid.unique <- function(x, y, include.equals=FALSE)
{
  left <- unique(x)
  right <- unique(y)
  pair_rows <- vector("list", length(left))

  for (k in seq_along(left)) {
    already_used <- left[seq_len(k - include.equals)]
    partners <- setdiff(right, already_used)
    if (length(partners) > 0) {
      pair_rows[[k]] <- cbind(left[k], partners, deparse.level = 0)
    }
  }

  do.call(rbind, pair_rows)
}

## Read in cell line annotations & convert to unique.cellid (gCSI, CCLE, GDSC)

In [54]:
# Load the current cell-line annotation table; empty strings, single spaces
# and "NA" are all read as missing values.
cell_all <- read.csv(
  file = "rnaseq_meta/cell_annotation_all.csv",
  na.strings = c("", " ", "NA")
)

In [55]:
# For each dataset: read the RNA-seq sample metadata, attach the curated
# unique.cellid via matchToIDTable(), and index the rows by the identifier
# used to rename the count-matrix columns later on.

#read in gcsi cell annotations
gcsi <- read.csv(file = "rnaseq_meta/gcsi_rnaseq_meta.csv")
gcsi$cellid <- matchToIDTable(ids=gcsi$Cell_line , tbl = cell_all, column = "GNE.cellid", returnColumn = "unique.cellid")
rownames(gcsi) <- gcsi$alias

#read in ccle cell annotations
ccle <- read.csv(file = "rnaseq_meta/ccle_rnaseq_meta.csv")
ccle$cellid <- matchToIDTable(ids=ccle$Cell_Line , tbl = cell_all, column = "CCLE.cellid", returnColumn = "unique.cellid")
rownames(ccle) <- ccle$Run

#read in gdsc cell annotations
gdsc <- read.csv(file = "rnaseq_meta/gdsc_rnaseq_meta.txt", sep = "\t")
# NOTE(review): sample 15552_5.cram is excluded here; the reason is not recorded in this file
gdsc <- gdsc[which(gdsc$Comment.SUBMITTED_FILE_NAME. != "15552_5.cram"),]
gdsc$cellid <- matchToIDTable(ids=gdsc$Source.Name, tbl = cell_all, column = "GDSC_rnaseq.cellid", returnColumn = "unique.cellid")
# strip only a literal trailing ".cram" extension (the dot was previously an unescaped wildcard)
gdsc$files <- sub("\\.cram$", "", gdsc$Comment.SUBMITTED_FILE_NAME.)
rownames(gdsc) <- gdsc$files

## Read in circAtlas annotations for CIRI2 annotation

In [14]:
# Load the circAtlas coordinate/ID conversion table; "-" marks a missing value.
circAtlas <- read.csv("circAtlas/hg38_hg19_v2.0.txt", sep = "\t", header = FALSE, na.strings = "-")
circAtlas <- circAtlas[-c(1,2),] # drop the first two rows of the file
colnames(circAtlas) <- c("species","circAtlas_ID","hg38","hg19","circBase","circRNADb","deepbase2","circpedia2")
#circAtlas$hg19 <- gsub("\\|","-",circAtlas$hg19)
circAtlas$circAtlas_ID[grep("chr", circAtlas$circAtlas_ID)] <- NA #remove ID's with hg38 coordinates.
# Logical indexing instead of -which(): with -which() a table containing no
# duplicates / no NA would lose every row, because x[-integer(0), ] is empty.
# Keeps the first row for each hg19 coordinate and drops later repeats.
# NOTE(review): the original intent may have been to drop every copy of an ambiguous coordinate — confirm.
circAtlas <- circAtlas[!duplicated(circAtlas$hg19),]
circAtlas <- circAtlas[!is.na(circAtlas$hg19),]

## Read in circBase annotations for CIRCexplorer2 annotation

In [None]:
# TODO: supply the circBase annotation file — read.csv() is called without a
# `file` argument, so this cell stops with an error if it is run as written.
circBase <- read.csv()

## circRNA count matrix generation (summarizeCIRIMatrix, summarizeCIRCMatrix)

### Function accomplishes the following:
- creates a circRNA count matrix (one row per circRNA_ID, one column per sample) covering all samples in a dataset (e.g. gCSI), separately for each tool (CIRI2, CIRCexplorer2)

<u>*NOTE*:</u> utilizes summarizeCIRI/summarizeCIRC output (circRNA_IDs)

#### CIRI2

In [56]:
#import summarizeCIRI results to obtain all unique circRNA_ID's to create circRNA count matrix
load("summarizeCIRI.RData")
# NOTE(review): hansen_ciri_summ$circRNA_ids was listed twice; the repeat had no
# effect because of unique(). If a different dataset was meant, add it here.
unique_ids <- unique(c(gcsi_ciri_summ$circRNA_ids,
                       ccle_ciri_summ$circRNA_ids,
                       gdsc_ciri_summ$circRNA_ids,
                       hansen_ciri_summ$circRNA_ids,
                       gdsc_ribozero_ciri_summ$circRNA_ids))


In [57]:
# Build a circRNA junction-read count matrix from a directory of CIRI2 results.
#
# dir_path         directory containing one CIRI2 result file (*.tsv) per sample
# unique_circ_ids  character vector of unique circRNA IDs used as matrix rows
#
# Returns a data.frame with one row per circRNA ID and one column per sample
# (named after the file without its extension). Only exonic circRNAs with at
# least 2 junction reads are counted; IDs not detected in any sample are
# dropped and the remaining missing entries are set to 0.
# Stops when `dir_path` contains no .tsv file.
summarizeCIRIMatrix <- function(dir_path, unique_circ_ids){

  ciri_files <- list.files(path=dir_path,  pattern = "\\.tsv$", full.names = TRUE)
  if (length(ciri_files) == 0L) {
    stop("no CIRI2 .tsv result files found in: ", dir_path, call. = FALSE)
  }
  circid_matrix <- data.frame(matrix(ncol=length(ciri_files), nrow = length(unique_circ_ids)))
  rownames(circid_matrix) <- unique_circ_ids

  ciri_columns <- c("circRNA_ID","chr","circRNA_start","circRNA_end","junction_reads", "SM_MS_SMS", "non_junction_reads", "junction_reads_ratio", "circRNA_type", "gene_id", "strand","junction_reads_ID")

  for (f in seq_along(ciri_files)) {
    # the first line is the CIRI2 header, replaced by the names above
    calls <- read.table(file = ciri_files[f],
                        sep = '\t',
                        skip = 1,
                        header = FALSE)
    colnames(calls) <- ciri_columns

    # Take the file name first, then cut at its first dot, so a dot in a
    # directory name can no longer truncate the sample name.
    sample_name <- sub("\\..*", "", basename(ciri_files[f]))

    calls <- calls[which(calls$circRNA_type=="exon"),]
    calls <- calls[which(calls$junction_reads >= 2),]

    if (nrow(calls) > 0L) {
      #group junction reads by circRNA_ID
      circ_reads <- tapply(calls$junction_reads, as.character(calls$circRNA_ID), sum)

      #add sample to circRNA_ID count matrix
      circid_matrix[names(circ_reads), f] <- as.vector(circ_reads)
    }
    names(circid_matrix)[f] <- sample_name

  }
  #remove circRNAs without expression (NA) in every sample; drop = FALSE keeps
  #a data.frame when the directory holds a single sample
  circid_matrix <- circid_matrix[rowSums(is.na(circid_matrix)) != ncol(circid_matrix), , drop = FALSE]
  circid_matrix[is.na(circid_matrix)] <- 0
  return(circid_matrix)

}

In [98]:
# CIRI2 count matrices, one per dataset, all built against the shared ID list
gcsi_ciri_matrix <- summarizeCIRIMatrix(
  dir_path = "results/CIRI2/gCSI/result",
  unique_circ_ids = unique_ids
)
ccle_ciri_matrix <- summarizeCIRIMatrix(
  dir_path = "results/CIRI2/CCLE/result",
  unique_circ_ids = unique_ids
)
gdsc_ciri_matrix <- summarizeCIRIMatrix(
  dir_path = "results/CIRI2/GDSC/result",
  unique_circ_ids = unique_ids
)

#save(gcsi_ciri_matrix, ccle_ciri_matrix, gdsc_ciri_matrix, file= "ciri_gene_matrix.RData")

# Counts can be 0, so add 1 before taking log2
gcsi_ciri_matrix_norm <- log2(1 + gcsi_ciri_matrix)
ccle_ciri_matrix_norm <- log2(1 + ccle_ciri_matrix)
gdsc_ciri_matrix_norm <- log2(1 + gdsc_ciri_matrix)

# Rename columns to unique.cellid: strip the dataset tag from the sample name,
# then look the remainder up in the row names of the annotation table.
colnames(gcsi_ciri_matrix_norm) <- gcsi$cellid[
  match(gsub("gcsi", "", colnames(gcsi_ciri_matrix_norm)), rownames(gcsi))
]

colnames(gdsc_ciri_matrix_norm) <- gdsc$cellid[
  match(gsub("gdsc", "", colnames(gdsc_ciri_matrix_norm)), rownames(gdsc))
]
# Reorder GDSC columns to follow the gCSI column order
gdsc_ciri_matrix_norm <- gdsc_ciri_matrix_norm[names(gcsi_ciri_matrix_norm)]

colnames(ccle_ciri_matrix_norm) <- ccle$cellid[
  match(colnames(ccle_ciri_matrix_norm), rownames(ccle))
]
# Reorder CCLE columns to follow the gCSI column order
ccle_ciri_matrix_norm <- ccle_ciri_matrix_norm[names(gcsi_ciri_matrix_norm)]

#### CIRCexplorer2

In [28]:
# Build a gene-level circRNA read-count matrix from CIRCexplorer2 annotate output.
#
# dir_path             directory searched recursively for result files (*.txt)
# gene_features        table whose row names are the gene IDs used as matrix
#                      rows; defaults to the `features_gene` object
# transcript_features  table with `transcript_id` and `gene_id` columns used to
#                      map each isoform to its gene; defaults to the
#                      `features_transcript` object
#
# Only records of type "circRNA" with at least 2 reads are used. Records whose
# isoform has no gene in `transcript_features` are dropped with a warning.
#
# Returns a list with
#   gene_matrix  data.frame of summed read numbers (genes x samples); genes
#                without reads in every sample are removed, other gaps are 0
#   circRNA_ids  named list (one data.frame per sample) giving, per gene_id,
#                the comma-separated "chrom:start|end" IDs of its circRNAs
summarizeCIRCMatrix <- function(dir_path, gene_features = features_gene, transcript_features = features_transcript){

  # anchored pattern: only files that really end in ".txt"
  circ_files <- list.files(path=dir_path,  pattern = "\\.txt$", full.names = TRUE, recursive = TRUE)
  if (length(circ_files) == 0L) {
    stop("no CIRCexplorer2 .txt result files found in: ", dir_path, call. = FALSE)
  }
  circRNA_ids <- vector("list", length(circ_files))
  sample_names <- character(length(circ_files))
  gene_matrix <- data.frame(matrix(ncol=length(circ_files), nrow = length(rownames(gene_features))))
  rownames(gene_matrix) <- rownames(gene_features)

  circ_columns <- c("chrom","start","end", "name","score","strand","thickStart","thickEnd","itemRgb","exonCount","exonSizes", "exonOffsets", "readNumber", "circType", "geneName", "isoformName", "index", "flankIntron")

  for (f in seq_along(circ_files)) {
    calls <- read.table(file = circ_files[f],
                        sep = '\t',
                        header = FALSE)
    colnames(calls) <- circ_columns

    sample_name <- trimws(sub("\\.txt$", "", basename(circ_files[f])))
    sample_name <- gsub("_circularRNA_known", "", sample_name)
    sample_names[f] <- sample_name

    calls <- calls[which(calls$circType=="circRNA"),]
    calls <- calls[which(calls$readNumber >= 2),]

    #get gene-id from the transcript table, as circexplorer2 does not provide it in result files
    calls$gene_id <- transcript_features$gene_id[match(calls$isoformName, transcript_features$transcript_id)]

    # an NA gene_id cannot be used as a row name, so such records are removed explicitly
    unmapped <- is.na(calls$gene_id)
    if (any(unmapped)) {
      warning(sum(unmapped), " circRNA record(s) in ", sample_name,
              " have no gene_id and were dropped", call. = FALSE)
      calls <- calls[!unmapped, ]
    }

    #create circRNA position ID, since not given, only in CIRI2 is it given automatically
    calls$circRNA_ID <- paste0(calls$chrom,":",calls$start,"|",calls$end)

    if (nrow(calls) > 0L) {
      gene_key <- as.character(calls$gene_id)

      #group sum of junction read number by gene_id
      gene_reads <- tapply(calls$readNumber, gene_key, sum)
      gene_matrix[names(gene_reads), f] <- as.vector(gene_reads)

      #group circRNA_id by gene ID
      ids_by_gene <- tapply(calls$circRNA_ID, gene_key, paste, collapse = ",")
      circ_ids <- data.frame(gene_id = names(ids_by_gene),
                             circRNA_ID = as.vector(ids_by_gene),
                             stringsAsFactors = FALSE)
    } else {
      circ_ids <- data.frame(gene_id = character(0),
                             circRNA_ID = character(0),
                             stringsAsFactors = FALSE)
    }

    names(gene_matrix)[f] <- sample_name
    circRNA_ids[[f]] <- circ_ids
  }
  names(circRNA_ids) <- sample_names

  #remove genes without reads (NA) in every sample; drop = FALSE keeps a
  #data.frame when only one sample is present
  gene_matrix <- gene_matrix[rowSums(is.na(gene_matrix)) != ncol(gene_matrix), , drop = FALSE]
  gene_matrix[is.na(gene_matrix)] <- 0
  xx <- list(gene_matrix = gene_matrix, circRNA_ids = circRNA_ids)
  return(xx)
}


In [39]:
# CIRCexplorer2 summaries (gene matrix + circRNA IDs), one per dataset
gcsi_circ_matrix_all <- summarizeCIRCMatrix(
  dir_path = "results/CIRCexplorer2/gCSI/unmapped_method/annotate"
)
ccle_circ_matrix_all <- summarizeCIRCMatrix(
  dir_path = "results/CIRCexplorer2/CCLE/unmapped_method/annotate"
)
gdsc_circ_matrix_all <- summarizeCIRCMatrix(
  dir_path = "results/CIRCexplorer2/GDSC/unmapped_method/annotate"
)

save(
  gcsi_circ_matrix_all, ccle_circ_matrix_all, gdsc_circ_matrix_all,
  file = "circexplorer_gene_matrix.RData"
)

# Keep only the gene-level count matrix of each summary
gcsi_circ_matrix <- gcsi_circ_matrix_all[["gene_matrix"]]
ccle_circ_matrix <- ccle_circ_matrix_all[["gene_matrix"]]
gdsc_circ_matrix <- gdsc_circ_matrix_all[["gene_matrix"]]

# Counts can be 0, so add 1 before taking log2
gcsi_circ_matrix_norm <- log2(1 + gcsi_circ_matrix)
ccle_circ_matrix_norm <- log2(1 + ccle_circ_matrix)
gdsc_circ_matrix_norm <- log2(1 + gdsc_circ_matrix)

# Rename columns to unique.cellid: strip the dataset tag from the sample name,
# then look the remainder up in the row names of the annotation table.
colnames(gcsi_circ_matrix_norm) <- gcsi$cellid[
  match(gsub("gcsi", "", colnames(gcsi_circ_matrix_norm)), rownames(gcsi))
]

colnames(gdsc_circ_matrix_norm) <- gdsc$cellid[
  match(gsub("gdsc", "", colnames(gdsc_circ_matrix_norm)), rownames(gdsc))
]
# Reorder GDSC columns to follow the gCSI column order
gdsc_circ_matrix_norm <- gdsc_circ_matrix_norm[names(gcsi_circ_matrix_norm)]

colnames(ccle_circ_matrix_norm) <- ccle$cellid[
  match(colnames(ccle_circ_matrix_norm), rownames(ccle))
]
# Reorder CCLE columns to follow the gCSI column order
ccle_circ_matrix_norm <- ccle_circ_matrix_norm[names(gcsi_circ_matrix_norm)]


## Compute concordance index (CI) for all datasets (gCSI, CCLE, GDSC)

### import PSets (drug sensitivity data)

In [59]:
# gCSI PharmacoSet (2017 drug sensitivity)
gCSI <- readRDS(file = "gCSI.rds")

In [60]:
# CTRPv2 PharmacoSet
CTRPv2 <- readRDS(file = "CTRPv2.rds")

In [61]:
# CCLE PharmacoSet
CCLE <- readRDS(file = "CCLE.rds")

In [62]:
# GDSC2 PharmacoSet, v8.2 (Feb2020 drug sensitivity)
GDSC2 <- readRDS(file = "GDSC2.rds")

### get average circRNA count for SR technical replicate in gCSI

In [63]:
#CIRI2 matrix: collapse the SR technical replicates into one averaged column
# Select the replicate columns by position: indexing by the name "SR" twice
# returns the FIRST "SR" column twice, so the replicates were never averaged.
sr_cols <- which(colnames(gcsi_ciri_matrix_norm) == "SR")
stopifnot(length(sr_cols) > 0)
combined_mean <- as.data.frame(rowMeans(gcsi_ciri_matrix_norm[, sr_cols, drop = FALSE], na.rm = TRUE))
v <- combined_mean[[1]]
# Drop every repeated column name; guarded because x[, -integer(0)] selects no columns at all
idx <- which(duplicated(colnames(gcsi_ciri_matrix_norm)))
if (length(idx) > 0) {
  gcsi_ciri_matrix_norm <- gcsi_ciri_matrix_norm[, -idx, drop = FALSE]
}
gcsi_ciri_matrix_norm$SR <- v

In [35]:
#CIRCexplorer2 matrix: collapse the SR technical replicates into one averaged column
# Select the replicate columns by position: indexing by the name "SR" twice
# returns the FIRST "SR" column twice, so the replicates were never averaged.
sr_cols <- which(colnames(gcsi_circ_matrix_norm) == "SR")
stopifnot(length(sr_cols) > 0)
combined_mean <- as.data.frame(rowMeans(gcsi_circ_matrix_norm[, sr_cols, drop = FALSE], na.rm = TRUE))
v <- combined_mean[[1]]
# Drop every repeated column name; guarded because x[, -integer(0)] selects no columns at all
idx <- which(duplicated(colnames(gcsi_circ_matrix_norm)))
if (length(idx) > 0) {
  gcsi_circ_matrix_norm <- gcsi_circ_matrix_norm[, -idx, drop = FALSE]
}
gcsi_circ_matrix_norm$SR <- v

### summarizeDrugSensitivity for all datasets

In [64]:
# Cell lines present in the gCSI CIRI2 matrix (intersected biological replicates)
intersected_cells <- colnames(gcsi_ciri_matrix_norm)

# Recomputed AAC per drug and cell line. gCSI is summarised over all of its
# cell lines; the other PSets are restricted to `intersected_cells`.
gcsi_summarize <- summarizeSensitivityProfiles(
  pSet = gCSI,
  sensitivity.measure = "aac_recomputed",
  fill.missing = F
)
ctrpv2_summarize <- summarizeSensitivityProfiles(
  pSet = CTRPv2,
  sensitivity.measure = "aac_recomputed",
  cell.lines = intersected_cells,
  fill.missing = F
)
ccle_summarize <- summarizeSensitivityProfiles(
  pSet = CCLE,
  sensitivity.measure = "aac_recomputed",
  cell.lines = intersected_cells,
  fill.missing = F
)
gdsc_summarize <- summarizeSensitivityProfiles(
  pSet = GDSC2,
  sensitivity.measure = "aac_recomputed",
  cell.lines = intersected_cells,
  fill.missing = F
)


### compute CI for each gene-drug combination

In [65]:
#computes concordance index (CI) for every circRNA-drug combination of one dataset
#circRNA_normalized_data = log2 + 1 circRNA matrix (CIRI2, CIRCexplorer2); rows are circRNAs, columns are samples
#sensitivity_data = result of 'summarizeSensitivityProfiles' in PharmacoGx; rows are drugs, columns are samples
#samples = commonSamples (sample names present in both inputs)
#
#returns a data.frame with columns circRNA, drug, ci and pvalue

computeCI <- function(circRNA_normalized_data, sensitivity_data, samples){

  circRNA <- rownames(circRNA_normalized_data)
  drugs <- unique(rownames(sensitivity_data))
  commonSamples <- samples
  combinations <- as.data.frame(expand.grid.unique(circRNA, drugs, include.equals = TRUE))
  combinations$ci <- NA
  combinations$pvalue <- NA
  colnames(combinations) <- c("circRNA","drug","ci","pvalue")

  # every sample counts as an observed event
  events <- rep(1, length(commonSamples))

  for (i in seq_len(nrow(combinations))){
    #print(paste0(i, " out of ", nrow(combinations), " complete"))
    tt <- sensitivity_data[combinations$drug[i], commonSamples]
    tt[which(is.na(tt))] <- 0 #some sensitivities are NA due to filterNoisyCurve function, which causes error when running CI with survcomp
    # Use the matrix passed in by the caller; the global gcsi_ciri_matrix_norm
    # was read here before, so CCLE/GDSC calls were scored against gCSI expression.
    expression <- unlist(-circRNA_normalized_data[combinations$circRNA[i], commonSamples])
    ci <- survcomp::concordance.index(tt, surv.time = expression, surv.event = events, outx = FALSE, method="noether")
    combinations$pvalue[i] <- ci$p.value
    combinations$ci[i] <- ci$c.index
  }

  return(combinations)
}

#### gCSI

In [99]:
# Keep only circRNAs (rows) whose count is 0 in at most 35 samples (~70%)
gcsi_ciri_matrix_norm <- gcsi_ciri_matrix_norm[
  rowSums(gcsi_ciri_matrix_norm == 0) <= 35,
]

In [100]:
# CI of gCSI circRNA expression against gCSI drug sensitivity,
# using only samples present in both
commonSamples <- intersect(colnames(gcsi_summarize), colnames(gcsi_ciri_matrix_norm))
gcsi_ci_result <- computeCI(
  circRNA_normalized_data = gcsi_ciri_matrix_norm,
  sensitivity_data = gcsi_summarize,
  samples = commonSamples
)

In [101]:
# CI of gCSI circRNA expression against CCLE drug sensitivity,
# using only samples present in both
commonSamples <- intersect(colnames(ccle_summarize), colnames(gcsi_ciri_matrix_norm))
ccle_ci_result <- computeCI(
  circRNA_normalized_data = gcsi_ciri_matrix_norm,
  sensitivity_data = ccle_summarize,
  samples = commonSamples
)

In [102]:
# CI of gCSI circRNA expression against GDSC drug sensitivity,
# using only samples present in both
commonSamples <- intersect(colnames(gdsc_summarize), colnames(gcsi_ciri_matrix_norm))
gdsc_ci_result <- computeCI(
  circRNA_normalized_data = gcsi_ciri_matrix_norm,
  sensitivity_data = gdsc_summarize,
  samples = commonSamples
)

In [103]:
# CI of gCSI circRNA expression against CTRPv2 drug sensitivity,
# using only samples present in both
commonSamples <- intersect(colnames(ctrpv2_summarize), colnames(gcsi_ciri_matrix_norm))
ctrpv2_ci_result <- computeCI(
  circRNA_normalized_data = gcsi_ciri_matrix_norm,
  sensitivity_data = ctrpv2_summarize,
  samples = commonSamples
)

In [71]:
#save all computed CI's (gCSI circRNA matrix)
# save() does not create missing directories, so make sure the target exists
dir.create("gcsi_ci_result", showWarnings = FALSE, recursive = TRUE)
save(gcsi_ci_result, file="gcsi_ci_result/gcsi_CI.RData")
save(ccle_ci_result, file="gcsi_ci_result/ccle_CI.RData")
save(gdsc_ci_result, file="gcsi_ci_result/gdsc_CI.RData")
save(ctrpv2_ci_result, file="gcsi_ci_result/ctrpv2_CI.RData")

#### CCLE

In [75]:
# Keep only circRNAs (rows) whose count is 0 in at most 35 samples (~70%)
ccle_ciri_matrix_norm <- ccle_ciri_matrix_norm[
  rowSums(ccle_ciri_matrix_norm == 0) <= 35,
]

In [76]:
# CI of CCLE circRNA expression against CCLE drug sensitivity,
# using only samples present in both
commonSamples <- intersect(colnames(ccle_summarize), colnames(ccle_ciri_matrix_norm))
ccle_ci_result <- computeCI(
  circRNA_normalized_data = ccle_ciri_matrix_norm,
  sensitivity_data = ccle_summarize,
  samples = commonSamples
)

In [77]:
# CI of CCLE circRNA expression against gCSI drug sensitivity,
# using only samples present in both
commonSamples <- intersect(colnames(gcsi_summarize), colnames(ccle_ciri_matrix_norm))
gcsi_ci_result <- computeCI(
  circRNA_normalized_data = ccle_ciri_matrix_norm,
  sensitivity_data = gcsi_summarize,
  samples = commonSamples
)

In [78]:
# CI of CCLE circRNA expression against GDSC drug sensitivity,
# using only samples present in both
commonSamples <- intersect(colnames(gdsc_summarize), colnames(ccle_ciri_matrix_norm))
gdsc_ci_result <- computeCI(
  circRNA_normalized_data = ccle_ciri_matrix_norm,
  sensitivity_data = gdsc_summarize,
  samples = commonSamples
)

In [79]:
# CI of CCLE circRNA expression against CTRPv2 drug sensitivity,
# using only samples present in both
commonSamples <- intersect(colnames(ctrpv2_summarize), colnames(ccle_ciri_matrix_norm))
ctrpv2_ci_result <- computeCI(
  circRNA_normalized_data = ccle_ciri_matrix_norm,
  sensitivity_data = ctrpv2_summarize,
  samples = commonSamples
)

In [80]:
#save all computed CI's (CCLE circRNA matrix)
# save() does not create missing directories, so make sure the target exists
dir.create("ccle_ci_result", showWarnings = FALSE, recursive = TRUE)
save(gcsi_ci_result, file="ccle_ci_result/gcsi_CI.RData")
save(ccle_ci_result, file="ccle_ci_result/ccle_CI.RData")
save(gdsc_ci_result, file="ccle_ci_result/gdsc_CI.RData")
save(ctrpv2_ci_result, file="ccle_ci_result/ctrpv2_CI.RData")

#### GDSC

In [104]:
# Keep only circRNAs (rows) whose count is 0 in at most 35 samples (~70%)
gdsc_ciri_matrix_norm <- gdsc_ciri_matrix_norm[
  rowSums(gdsc_ciri_matrix_norm == 0) <= 35,
]

In [105]:
# CI of GDSC circRNA expression against GDSC drug sensitivity,
# using only samples present in both
commonSamples <- intersect(colnames(gdsc_summarize), colnames(gdsc_ciri_matrix_norm))
gdsc_ci_result <- computeCI(
  circRNA_normalized_data = gdsc_ciri_matrix_norm,
  sensitivity_data = gdsc_summarize,
  samples = commonSamples
)

In [106]:
# CI of GDSC circRNA expression against CCLE drug sensitivity,
# using only samples present in both
commonSamples <- intersect(colnames(ccle_summarize), colnames(gdsc_ciri_matrix_norm))
ccle_ci_result <- computeCI(
  circRNA_normalized_data = gdsc_ciri_matrix_norm,
  sensitivity_data = ccle_summarize,
  samples = commonSamples
)

In [107]:
# CI of GDSC circRNA expression against gCSI drug sensitivity,
# using only samples present in both
commonSamples <- intersect(colnames(gcsi_summarize), colnames(gdsc_ciri_matrix_norm))
gcsi_ci_result <- computeCI(
  circRNA_normalized_data = gdsc_ciri_matrix_norm,
  sensitivity_data = gcsi_summarize,
  samples = commonSamples
)

In [108]:
# CI of GDSC circRNA expression against CTRPv2 drug sensitivity,
# using only samples present in both
commonSamples <- intersect(colnames(ctrpv2_summarize), colnames(gdsc_ciri_matrix_norm))
ctrpv2_ci_result <- computeCI(
  circRNA_normalized_data = gdsc_ciri_matrix_norm,
  sensitivity_data = ctrpv2_summarize,
  samples = commonSamples
)

In [109]:
#save all computed CI's (GDSC circRNA matrix)
# save() does not create missing directories, so make sure the target exists
dir.create("gdsc_ci_result", showWarnings = FALSE, recursive = TRUE)
save(gcsi_ci_result, file="gdsc_ci_result/gcsi_CI.RData")
save(ccle_ci_result, file="gdsc_ci_result/ccle_CI.RData")
save(gdsc_ci_result, file="gdsc_ci_result/gdsc_CI.RData")
save(ctrpv2_ci_result, file="gdsc_ci_result/ctrpv2_CI.RData")

## Identify significant gene-drug associations

#### gCSI

In [84]:
#reload the CI results computed from the gCSI circRNA matrix: the *_ci_result
#objects are reassigned by the CCLE and GDSC sections, so the in-memory copies
#may belong to another circRNA matrix when this cell runs
load("gcsi_ci_result/gcsi_CI.RData")
load("gcsi_ci_result/ccle_CI.RData")
load("gcsi_ci_result/gdsc_CI.RData")
load("gcsi_ci_result/ctrpv2_CI.RData")

#filters for CI > 0.5 and p-value < 0.05
gcsi_ci_sig <- gcsi_ci_result[which(gcsi_ci_result$pvalue < 0.05 & gcsi_ci_result$ci > 0.5),]
ccle_ci_sig <- ccle_ci_result[which(ccle_ci_result$pvalue < 0.05 & ccle_ci_result$ci > 0.5),]
gdsc_ci_sig <- gdsc_ci_result[which(gdsc_ci_result$pvalue < 0.05 & gdsc_ci_result$ci > 0.5),]
ctrpv2_ci_sig <- ctrpv2_ci_result[which(ctrpv2_ci_result$pvalue < 0.05 & ctrpv2_ci_result$ci > 0.5),]

#assign unique.id for each circRNA_drug combination
gcsi_ci_sig$id <- paste0(gcsi_ci_sig$circRNA,"_",gcsi_ci_sig$drug)
ccle_ci_sig$id <- paste0(ccle_ci_sig$circRNA,"_",ccle_ci_sig$drug)
gdsc_ci_sig$id <- paste0(gdsc_ci_sig$circRNA,"_",gdsc_ci_sig$drug)
ctrpv2_ci_sig$id <- paste0(ctrpv2_ci_sig$circRNA,"_",ctrpv2_ci_sig$drug)

#keep only the unique.id's that are not already significant with gCSI sensitivity
ccle_ci_sig <- ccle_ci_sig[which(!ccle_ci_sig$id %in% gcsi_ci_sig$id),]
gdsc_ci_sig <- gdsc_ci_sig[which(!gdsc_ci_sig$id %in% gcsi_ci_sig$id),]
ctrpv2_ci_sig <- ctrpv2_ci_sig[which(!ctrpv2_ci_sig$id %in% gcsi_ci_sig$id),]

#combined significant circRNA-drug associations for the gCSI circRNA matrix
#(unique only; for an id found in several datasets the first occurrence is kept)
gcsi_ci_combined <- do.call("rbind", list(gcsi_ci_sig, ccle_ci_sig, gdsc_ci_sig, ctrpv2_ci_sig))
gcsi_ci_combined <- gcsi_ci_combined[!duplicated(gcsi_ci_combined[ , "id"]),]

In [86]:
#sort circRNA-drug associations by largest CI, ties broken by smallest p-value
# rev(pvalue) only reversed the vector, pairing each row with another row's
# p-value; negating the CI gives a descending CI / ascending p-value order.
gcsi_ci_combined_sort <- gcsi_ci_combined[order(-gcsi_ci_combined$ci, gcsi_ci_combined$pvalue), ]

#### CCLE

In [87]:
#reload the CI results computed from the CCLE circRNA matrix: the *_ci_result
#objects are reassigned by the gCSI and GDSC sections, so the in-memory copies
#may belong to another circRNA matrix when this cell runs
load("ccle_ci_result/gcsi_CI.RData")
load("ccle_ci_result/ccle_CI.RData")
load("ccle_ci_result/gdsc_CI.RData")
load("ccle_ci_result/ctrpv2_CI.RData")

#filters for CI > 0.5 and p-value < 0.05
gcsi_ci_sig <- gcsi_ci_result[which(gcsi_ci_result$pvalue < 0.05 & gcsi_ci_result$ci > 0.5),]
ccle_ci_sig <- ccle_ci_result[which(ccle_ci_result$pvalue < 0.05 & ccle_ci_result$ci > 0.5),]
gdsc_ci_sig <- gdsc_ci_result[which(gdsc_ci_result$pvalue < 0.05 & gdsc_ci_result$ci > 0.5),]
ctrpv2_ci_sig <- ctrpv2_ci_result[which(ctrpv2_ci_result$pvalue < 0.05 & ctrpv2_ci_result$ci > 0.5),]

#assign unique.id for each circRNA_drug combination
gcsi_ci_sig$id <- paste0(gcsi_ci_sig$circRNA,"_",gcsi_ci_sig$drug)
ccle_ci_sig$id <- paste0(ccle_ci_sig$circRNA,"_",ccle_ci_sig$drug)
gdsc_ci_sig$id <- paste0(gdsc_ci_sig$circRNA,"_",gdsc_ci_sig$drug)
ctrpv2_ci_sig$id <- paste0(ctrpv2_ci_sig$circRNA,"_",ctrpv2_ci_sig$drug)

#keep only the unique.id's that are not already significant with CCLE sensitivity
gcsi_ci_sig <- gcsi_ci_sig[which(!gcsi_ci_sig$id %in% ccle_ci_sig$id),]
gdsc_ci_sig <- gdsc_ci_sig[which(!gdsc_ci_sig$id %in% ccle_ci_sig$id),]
ctrpv2_ci_sig <- ctrpv2_ci_sig[which(!ctrpv2_ci_sig$id %in% ccle_ci_sig$id),]

#combined significant circRNA-drug associations for the CCLE circRNA matrix
#(unique only; for an id found in several datasets the first occurrence is kept)
ccle_ci_combined <- do.call("rbind", list(gcsi_ci_sig, ccle_ci_sig, gdsc_ci_sig, ctrpv2_ci_sig))
ccle_ci_combined <- ccle_ci_combined[!duplicated(ccle_ci_combined[ , "id"]),]

In [90]:
#sort circRNA-drug associations by largest CI, ties broken by smallest p-value
# rev(pvalue) only reversed the vector, pairing each row with another row's
# p-value; negating the CI gives a descending CI / ascending p-value order.
ccle_ci_combined_sort <- ccle_ci_combined[order(-ccle_ci_combined$ci, ccle_ci_combined$pvalue), ]

#### GDSC 

In [110]:
#reload the CI results computed from the GDSC circRNA matrix: the *_ci_result
#objects are reassigned by the gCSI and CCLE sections, so the in-memory copies
#may belong to another circRNA matrix when this cell runs
load("gdsc_ci_result/gcsi_CI.RData")
load("gdsc_ci_result/ccle_CI.RData")
load("gdsc_ci_result/gdsc_CI.RData")
load("gdsc_ci_result/ctrpv2_CI.RData")

#filters for CI > 0.5 and p-value < 0.05
gcsi_ci_sig <- gcsi_ci_result[which(gcsi_ci_result$pvalue < 0.05 & gcsi_ci_result$ci > 0.5),]
ccle_ci_sig <- ccle_ci_result[which(ccle_ci_result$pvalue < 0.05 & ccle_ci_result$ci > 0.5),]
gdsc_ci_sig <- gdsc_ci_result[which(gdsc_ci_result$pvalue < 0.05 & gdsc_ci_result$ci > 0.5),]
ctrpv2_ci_sig <- ctrpv2_ci_result[which(ctrpv2_ci_result$pvalue < 0.05 & ctrpv2_ci_result$ci > 0.5),]

#assign unique.id for each circRNA_drug combination
gcsi_ci_sig$id <- paste0(gcsi_ci_sig$circRNA,"_",gcsi_ci_sig$drug)
ccle_ci_sig$id <- paste0(ccle_ci_sig$circRNA,"_",ccle_ci_sig$drug)
gdsc_ci_sig$id <- paste0(gdsc_ci_sig$circRNA,"_",gdsc_ci_sig$drug)
ctrpv2_ci_sig$id <- paste0(ctrpv2_ci_sig$circRNA,"_",ctrpv2_ci_sig$drug)

#keep only the unique.id's that are not already significant with GDSC sensitivity
gcsi_ci_sig <- gcsi_ci_sig[which(!gcsi_ci_sig$id %in% gdsc_ci_sig$id),]
ccle_ci_sig <- ccle_ci_sig[which(!ccle_ci_sig$id %in% gdsc_ci_sig$id),]
ctrpv2_ci_sig <- ctrpv2_ci_sig[which(!ctrpv2_ci_sig$id %in% gdsc_ci_sig$id),]

#combined significant circRNA-drug associations for the GDSC circRNA matrix
#(unique only; for an id found in several datasets the first occurrence is kept)
gdsc_ci_combined <- do.call("rbind", list(gcsi_ci_sig, ccle_ci_sig, gdsc_ci_sig, ctrpv2_ci_sig))
gdsc_ci_combined <- gdsc_ci_combined[!duplicated(gdsc_ci_combined[ , "id"]),]