# Comparison in cancer

The factorization methods are compared here on their performance on multi-omics data. The comparison is organized into three parts: association with survival, association with clinical annotations, and association with biological annotations (GO, REACTOME, Hallmarks).

## Comparison based on clinical annotations

Function that computes the selectivity and the number of clinical annotations found significantly associated with at least one factor. All clinical metadata should be stored in a clinical folder containing one file per cancer type, named after that cancer type.

In [1]:
# Compare factorizations by the association of their factors with clinical
# annotations.
#
# Arguments:
#   factorizations  list with one entry per method; the first element of each
#                   entry is the factor matrix (samples in rows, factors in
#                   columns, sample ids as row names).
#   clinical        data frame whose first column holds the sample ids; ids are
#                   upper-cased and '-' is replaced by '.' before matching.
#   col             names of the clinical annotations to test; names missing
#                   from `clinical` are skipped.
#
# Only the annotations handled explicitly below are tested:
#   age_at_initial_pathologic_diagnosis              Kruskal-Wallis, 5 bins
#   days_to_new_tumor_event_after_initial_treatment  Kruskal-Wallis, 3 bins
#   gender, history_of_neoadjuvant_treatment         Wilcoxon test
#
# Returns a list of one-column matrices with one row per factorization:
#   selectivity   (annotations with a significant factor) / (number of
#                 significant factor-annotation tests), 0 when nothing is
#                 significant
#   nnzero        number of distinct factors significant for some annotation
#   num.enriched  number of annotations with at least one significant factor
clinical_comparison <- function(factorizations, clinical, col) {
  n_methods <- length(factorizations)
  selectivity <- numeric(n_methods)
  nnzero <- numeric(n_methods)
  num_enriched <- numeric(n_methods)

  patient.names.in.file <- toupper(gsub('-', '\\.', as.character(clinical[, 1])))

  for (i in seq_along(factorizations)) {
    factors <- factorizations[[i]][[1]]

    # Keep only the samples that have a clinical record. drop = FALSE keeps
    # the matrix shape (and row names) even with a single factor or sample.
    patient.names <- rownames(factors)
    has_record <- patient.names %in% patient.names.in.file
    if (!all(has_record)) {
      patient.names <- patient.names[has_record]
      factors <- factors[has_record, , drop = FALSE]
    }
    ordered.clinical.data <- clinical[match(patient.names, patient.names.in.file), ]

    col_new <- col[col %in% colnames(ordered.clinical.data)]

    pvalues <- numeric(0)
    column <- numeric(0)
    clin_erich <- 0

    for (j in col_new) {
      annotation <- ordered.clinical.data[, j]

      # An annotation with fewer than two observed values cannot be tested.
      if (length(which(table(annotation) > 0)) <= 1) {
        next
      }

      # One p-value per factor, computed once and reused below.
      if (j == "age_at_initial_pathologic_diagnosis") {
        groups <- cut(as.numeric(annotation), 5, include.lowest = TRUE)
        p <- apply(factors, MARGIN = 2,
                   function(x) kruskal.test(x ~ groups)$p.value)
      } else if (j == "days_to_new_tumor_event_after_initial_treatment") {
        groups <- cut(as.numeric(annotation), 3, include.lowest = TRUE)
        p <- apply(factors, MARGIN = 2,
                   function(x) kruskal.test(x ~ groups)$p.value)
      } else if (j == "gender" || j == "history_of_neoadjuvant_treatment") {
        p <- apply(factors, MARGIN = 2,
                   function(x) wilcox.test(x ~ annotation)$p.value)
      } else {
        next
      }

      column <- c(column, colnames(factors)[which(p < 0.05)])
      pvalues <- c(pvalues, p)
      # NA p-values are treated as non-significant rather than aborting the run.
      if (any(p < 0.05, na.rm = TRUE)) {
        clin_erich <- clin_erich + 1
      }
    }

    signif <- length(which(pvalues < 0.05))
    selectivity[i] <- if (clin_erich != 0) clin_erich / signif else 0
    nnzero[i] <- length(unique(column))
    num_enriched[i] <- clin_erich
  }

  list(selectivity = matrix(selectivity, ncol = 1),
       nnzero = matrix(nnzero, ncol = 1),
       num.enriched = matrix(num_enriched, ncol = 1))
}

## Comparison based on survival prediction

Function that tests whether a given set of factors is significantly associated with survival.

In [2]:
library('survival')

# Test the association of each factorization's factors with survival and plot
# the resulting p-values.
#
# Arguments:
#   factorizations     list with one entry per method; the first element of
#                      each entry is the factor matrix (samples in rows, sample
#                      ids as row names).
#   method             method names; only its length is used, for the x
#                      positions of the plot.
#   survival           data frame whose first column holds the sample ids and
#                      which has columns `Survival` and `Death`; ids are
#                      upper-cased and '-' is replaced by '.' before matching.
#   out.folder         folder the plot is written to.
#   cancer             label appended to "survival_" to build the plot file
#                      name.
#   pvalue.multiplier  factor applied to the Cox p-values before thresholding
#                      (default 8, the value previously hard-coded).
#                      NOTE(review): presumably a multiple-testing correction
#                      by the number of methods - confirm.
#
# Returns a numeric vector with, for each factorization, the number of factors
# whose multiplied p-value is below 0.05. As a side effect a PNG file named
# "survival_<cancer>" is written into `out.folder`.
survival_comparison <- function(factorizations, method, survival, out.folder,
                                cancer, pvalue.multiplier = 8) {
  factors_cancer <- numeric(0)
  surv_final <- numeric(0)

  # Nothing to test or plot.
  if (length(factorizations) == 0) {
    return(factors_cancer)
  }

  patient.names.in.file <- toupper(gsub('-', '\\.', as.character(survival[, 1])))

  for (i in seq_along(factorizations)) {
    factors <- factorizations[[i]][[1]]

    # Keep only the samples that have a survival record. drop = FALSE keeps
    # the matrix shape (and row names) even with a single factor or sample.
    patient.names <- rownames(factors)
    has_record <- patient.names %in% patient.names.in.file
    if (!all(has_record)) {
      patient.names <- patient.names[has_record]
      factors <- factors[has_record, , drop = FALSE]
    }
    ordered.survival.data <- survival[match(patient.names, patient.names.in.file), ]

    # Missing survival times and death indicators are replaced by 0.
    ordered.survival.data$Survival[is.na(ordered.survival.data$Survival)] <- 0
    ordered.survival.data$Death[is.na(ordered.survival.data$Death)] <- 0

    coxph_obj <- coxph(Surv(ordered.survival.data$Survival,
                            ordered.survival.data$Death) ~ factors)
    # Column 5 of the coefficient table is used as the per-factor p-value.
    pvalues <- pvalue.multiplier * as.matrix(coef(summary(coxph_obj))[, 5])
    factors_cancer <- c(factors_cancer, length(which(pvalues < 0.05)))
    surv_final <- cbind(surv_final, pvalues)
  }

  surv_final <- -log10(surv_final)

  # Plot the survival p-values of this cancer type. The device is closed by
  # on.exit() so that a plotting error does not leave it open.
  png(file = file.path(out.folder, paste("survival_", cancer, sep = "")),
      width = 15, height = 15, units = 'in', res = 200)
  on.exit(dev.off(), add = TRUE)
  matplot(seq_along(method), t(surv_final), col = "black", pch = 18,
          xlab = "Method", ylab = "Pvalues survival", xaxt = "none", cex = 1.5)
  abline(h = (-log10(0.05)), v = 0, col = "black", lty = 3, lwd = 3)
  axis(1, at = seq_along(method), labels = colnames(surv_final))

  factors_cancer
}

## Comparison based on association to biological annotations (REACTOME, Hallmarks, GO)

In [3]:
library("fgsea")

# Compare factorizations by the enrichment of their metagenes in the pathways
# of an annotation database.
#
# Arguments:
#   factorizations  list with one entry per method; element [[2]][[1]] of each
#                   entry is the metagene matrix (genes in rows, factors in
#                   columns, gene identifiers as row names).
#   path.database   path of the GMT file read with gmtPathways().
#   pval.thr        threshold on the adjusted p-value reported by fgsea().
#   num.factors     number of factors (columns) of each metagene matrix.
#
# Returns a list of one-column matrices with one row per factorization:
#   selectivity   (total number of significant factor-pathway pairs) /
#                 (number of distinct significant pathways), NA when no
#                 pathway is significant
#   nnzero        number of factors with at least one significant pathway
#   num.enriched  number of distinct significant pathways
bioligcal_comparison <- function(factorizations, path.database, pval.thr = 0.05, num.factors) {
  pathways <- gmtPathways(path.database)

  n_methods <- length(factorizations)
  report_number <- numeric(n_methods)
  report_nnzero <- numeric(n_methods)
  report_select <- numeric(n_methods)

  for (i in seq_along(factorizations)) {
    metagenes <- factorizations[[i]][[2]][[1]]
    colnames(metagenes) <- seq_len(num.factors)
    # Keep only the part of each gene identifier that precedes the first
    # '|' or '.'.
    rownames(metagenes) <- gsub("\\|", ".", rownames(metagenes))
    rownames(metagenes) <- gsub("\\..*", "", rownames(metagenes))

    path <- character(0)
    n <- 0
    for (j in seq_len(num.factors)) {
      rnk <- setNames(as.matrix(metagenes[, j]), rownames(metagenes))
      fgseaRes <- fgsea(pathways, rnk, minSize = 15, maxSize = 500, nperm = 1000)
      # which() ignores NA adjusted p-values instead of failing on them.
      significant <- which(fgseaRes$padj < pval.thr)
      if (length(significant) > 0) {
        n <- n + 1
        path <- c(path, as.character(fgseaRes$pathway[significant]))
      }
    }

    n_unique <- length(unique(path))
    report_number[i] <- n_unique
    report_select[i] <- if (n_unique == 0) NA else length(path) / n_unique
    report_nnzero[i] <- n
  }

  list(selectivity = matrix(report_select, ncol = 1),
       nnzero = matrix(report_nnzero, ncol = 1),
       num.enriched = matrix(report_number, ncol = 1))
}

Loading required package: Rcpp


## Running the comparison in cancer

The cancer data should be organized in the data folder as one subfolder per cancer type, each named after its cancer type and containing the various omics data (3 omics for our test cases).

In [4]:
# load the function that runs the factorizations
source("runfactorization.R")

# list of cancer folders; the data should be organized as discussed above
cancers <- list.dirs(path = "../data/cancer", full.names = TRUE, recursive = TRUE)
# the first entry is dropped (the root folder itself); cancers[-1] also copes
# with a folder that has no sub-folder, where 2:length(cancers) would yield NA
cancers <- cancers[-1]
if (length(cancers) == 0) {
  stop("no cancer folder found in ../data/cancer", call. = FALSE)
}

# database for biological enrichment
path.database <- "../data/biological_annotation/c2.cp.reactome.v6.2.symbols.gmt" #REACTOME
#path.database<-"../data/biological_annotation/h.all.v6.2.symbols.gmt" #Hallmarks
#path.database<-"../data/biological_annotation/c5.all.v6.2.symbols.gmt" #GO

# number of factors used in the paper
num.factors <- 10

# result tables: one row per method, one column per cancer type
clinical_selectivity <- numeric(0)
clinical_nnzero <- numeric(0)
clinical_num.enriched <- numeric(0)
bio_selectivity <- numeric(0)
bio_nnzero <- numeric(0)
bio_num.enriched <- numeric(0)
cancer.list <- character(0)

# clinical categories for the clinical test
col <- c("age_at_initial_pathologic_diagnosis",
         "gender",
         "days_to_new_tumor_event_after_initial_treatment",
         "history_of_neoadjuvant_treatment")

for (i in cancers) {
  cancer.list <- c(cancer.list, gsub('\\.\\./data/cancer/', '', i))
  out <- runfactorization(i, c("exp", "methy", "mirna"), num.factors, sep = " ", filtering = "sd")

  # NOTE(review): the two gsub() calls below strip only "../data/", so the
  # label passed on is "cancer/<name>": the survival plot goes to
  # "../results/survival_cancer/<name>" and the clinical file is looked up at
  # "../data/cancer/clinical/cancer/<name>". A "clinical" folder located under
  # "../data/cancer" would also be listed as a cancer folder above. Confirm
  # the intended data layout.

  # survival analysis
  survival <- as.data.frame(read.table(paste(i, "/survival", sep = ""), sep = "\t", header = TRUE))
  out_survival <- survival_comparison(out$factorizations, out$method, survival,
                                      "../results", gsub('\\.\\./data/', '', i))

  # clinical analysis
  clinical <- as.data.frame(read.table(paste("../data/cancer/clinical/", gsub('\\.\\./data/', '', i), sep = ""),
                                       sep = "\t", header = TRUE))
  out_clinical <- clinical_comparison(out$factorizations, clinical, col)
  clinical_selectivity <- cbind(clinical_selectivity, out_clinical$selectivity)
  clinical_nnzero <- cbind(clinical_nnzero, out_clinical$nnzero)
  clinical_num.enriched <- cbind(clinical_num.enriched, out_clinical$num.enriched)

  # biological analysis
  out_bio <- bioligcal_comparison(out$factorizations, path.database, pval.thr = 0.05, num.factors)
  bio_selectivity <- cbind(bio_selectivity, out_bio$selectivity)
  bio_nnzero <- cbind(bio_nnzero, out_bio$nnzero)
  bio_num.enriched <- cbind(bio_num.enriched, out_bio$num.enriched)
}

colnames(clinical_selectivity) <- cancer.list
colnames(clinical_nnzero) <- cancer.list
colnames(clinical_num.enriched) <- cancer.list
colnames(bio_selectivity) <- cancer.list
colnames(bio_nnzero) <- cancer.list
colnames(bio_num.enriched) <- cancer.list

# Use the same accessor as in the loop above (out$method): with `$` on a list,
# a longer name such as out$methods returns NULL when the element is called
# `method`, which silently leaves the tables without row names.
rownames(clinical_selectivity) <- out$method
rownames(clinical_nnzero) <- out$method
rownames(clinical_num.enriched) <- out$method
rownames(bio_selectivity) <- out$method
rownames(bio_nnzero) <- out$method
rownames(bio_num.enriched) <- out$method

write.table(clinical_selectivity, "../results/selectivity_clinical_annot.txt", sep = "\t", col.names = TRUE, row.names = TRUE)
write.table(clinical_nnzero, "../results/nonzero_clinical_annot.txt", sep = "\t", col.names = TRUE, row.names = TRUE)
write.table(clinical_num.enriched, "../results/number_enriched_clinical_annot.txt", sep = "\t", col.names = TRUE, row.names = TRUE)
write.table(bio_selectivity, "../results/selectivity_bio_annot.txt", sep = "\t", col.names = TRUE, row.names = TRUE)
write.table(bio_nnzero, "../results/nonzero_bio_annot.txt", sep = "\t", col.names = TRUE, row.names = TRUE)
write.table(bio_num.enriched, "../results/number_enriched_bio_annot.txt", sep = "\t", col.names = TRUE, row.names = TRUE)

Loading required package: MASS
Loading required package: NMF
Loading required package: pkgmaker
Loading required package: registry

Attaching package: ‘pkgmaker’

The following object is masked from ‘package:base’:

    isFALSE

Loading required package: rngtools
Loading required package: cluster
NMF - BioConductor layer [OK] | Shared memory capabilities [NO: bigmemory] | Cores 3/4
  To enable shared memory capabilities, try: install.extras('
NMF
')
Loading required package: mclust
Package 'mclust' version 5.4.3
Type 'citation("mclust")' for citing this R package in publications.
Loading required package: InterSIM
Loading required package: tools
Loading required package: ade4

Attaching package: ‘ade4’

The following object is masked from ‘package:BiocGenerics’:

    score


Attaching package: ‘GPArotation’

The following object is masked from ‘package:NMF’:

    entropy

“no function found corresponding to methods exports from ‘GenomicRanges’ for: ‘bindROWS’”
Attaching package: ‘MOFAt

ERROR: Error in file(file, "rt"): cannot open the connection
