# Comparison of performances on simulated data

Here the 9 multi-omics factorization methods are compared based on their performances on simulated data.

## Construction of the simulated data

The function producing the simulated data is defined here. It generates multi-omics simulated data (expression, methylation and protein quantification) composed of 100 samples, using the CRAN InterSIM package. Two different scenarios are considered: equally-sized clusters and heterogeneous ones. Moreover, different numbers of clusters are imposed on the data: 5, 10, 15.

In [1]:
library("InterSIM")

## INPUTS:
#out.folder = location where the simulated data should be saved
#num.clusters = number of clusters to be imposed on the data (5, 10 or 15)
#size = "heterogeneous" for heterogeneous clusters, "equal" for equally-sized clusters

## OUTPUTS: matrices of simulated data (omics1.txt, omics2.txt, omics3.txt) and the
## true cluster assignment (clusters.txt) are saved as tab-separated files in out.folder.
## An error is raised, before anything is simulated, if size or num.clusters is not supported.

simulated_data_generation <- function(out.folder,num.clusters,size="heterogeneous"){

    num.clusters <- as.numeric(num.clusters)

    ###the proportions of samples per clusters here defined are those used for the paper
    proportions <- list(
        heterogeneous=list(
            "5"=c(0.35,0.13,0.19,0.08,0.25),
            "10"=c(0.20,0.10,0.07,0.10,0.15,0.13,0.10,0.08,0.05,0.02),
            "15"=c(0.10,0.08,0.04,0.03,0.12,0.03,0.10,0.03,0.05,0.02,0.1,0.2,0.03,0.02,0.05)
        ),
        equal=list(
            "5"=c(0.2,0.2,0.2,0.2,0.2),
            "10"=c(0.1,0.1,0.1,0.10,0.1,0.1,0.1,0.1,0.1,0.1),
            "15"=c(0.07,0.07,0.07,0.06,0.07,0.07,0.07,0.06,0.07,0.06,0.07,0.06,0.07,0.06,0.07)
        )
    )

    # Validate the scenario up front: an unsupported value must stop the run
    # instead of falling through to an undefined or mismatched proportion vector.
    if(length(size)!=1 || !(size %in% names(proportions))){
        stop("ERROR: size can only assume value heterogeneous or equal",call.=FALSE)
    }
    cluster.key <- as.character(num.clusters)
    if(length(cluster.key)!=1 || !(cluster.key %in% names(proportions[[size]]))){
        stop("ERROR: num.clusters can only assume value 5, 10 or 15",call.=FALSE)
    }
    prop <- proportions[[size]][[cluster.key]]

    # the same effect size is applied to the three omics
    effect <- 2.5

    sim.D <- InterSIM(n.sample=100,cluster.sample.prop=prop,delta.methyl=effect,
    delta.expr=effect,delta.protein=effect,p.DMP=0.25,p.DEG=NULL,p.DEP=NULL,
    do.plot=FALSE, sample.cluster=TRUE, feature.cluster=TRUE)

    # each omics matrix is transposed before being written
    omics <- list(omics1.txt=t(sim.D$dat.methyl),
                  omics2.txt=t(sim.D$dat.expr),
                  omics3.txt=t(sim.D$dat.protein))
    for(file.name in names(omics)){
        write.table(omics[[file.name]],file.path(out.folder,file.name),sep="\t",row.names=TRUE,col.names=TRUE)
    }

    true.cluster.assignment <- sim.D$clustering.assignment
    write.table(true.cluster.assignment,file.path(out.folder,"clusters.txt"),sep="\t",row.names=TRUE,col.names=TRUE)

    message("data saved in folder")
    invisible(NULL)
}

Loading required package: MASS
Loading required package: NMF
Loading required package: pkgmaker
Loading required package: registry

Attaching package: ‘pkgmaker’

The following object is masked from ‘package:base’:

    isFALSE

Loading required package: rngtools
Loading required package: cluster
NMF - BioConductor layer [OK] | Shared memory capabilities [NO: bigmemory] | Cores 3/4
  To enable shared memory capabilities, try: install.extras('
NMF
')
Loading required package: tools


Here the performances of the 9 multi-omics factorization methods are compared on simulated data. 

## Comparison based on clustering assignment

The function comparing the methods' performances in terms of clustering assignment is defined here. It computes the Jaccard Index between the clusters predicted from the factorization and the ground-truth clusters imposed in the simulations. The cluster assignment is computed from the factors by k-means clustering, except for iCluster and intNMF, whose own clustering results are used.

In [2]:
## Best-match Jaccard Index of each predicted cluster against the true clusters.
#predicted= one-column matrix of predicted cluster labels, rownames are the sample names
#truth= one-column matrix of true cluster labels, rownames are the sample names
#number_cl= number of clusters; labels 1..number_cl are looked up in both matrices
## Returns a numeric vector of length number_cl: for predicted cluster k, the
## maximum Jaccard Index over all true clusters.
.best_jaccard_per_cluster <- function(predicted,truth,number_cl){
    vapply(seq_len(number_cl),function(pred.label){
        x1 <- rownames(predicted)[which(predicted[,1]==pred.label)]
        scores <- vapply(seq_len(number_cl),function(true.label){
            x2 <- rownames(truth)[which(truth[,1]==true.label)]
            shared <- length(intersect(x1,x2))
            shared/(length(x1)+length(x2)-shared)
        },numeric(1))
        max(scores)
    },numeric(1))
}

## INPUTS:
#factorizations= list of factorization results; for methods other than iCluster and
#                intNMF the first element of each result is the factor matrix given to kmeans
#                (presumably samples x factors with sample names as rownames -- verify against runfactorization)
#methods= names of the factorization methods that were run, one per element of factorizations
#in.folder= path to input folder (must contain clusters.txt)
#out.folder= path to output folder
#icluster.clusters= clustering result of iCluster
#intNMF.clusters= clustering result of intNMF
#number_cl= number of clusters
#kmeans.runs= number of kmeans repetitions whose Jaccard Indexes are averaged (default 1000)

## OUTPUTS: matrix of Jaccard Indexes (number_cl rows, one column per method) is saved
## to JI.txt in out.folder and returned invisibly

clusters_comparison <- function(factorizations,methods,in.folder,out.folder,icluster.clusters,intNMF.clusters,number_cl,kmeans.runs=1000){

    stopifnot(length(methods)==length(factorizations))

    # one row per cluster, one column per method (sized from the inputs rather than hard-coded)
    JI_final <- matrix(data=NA_real_,nrow=number_cl,ncol=length(factorizations))

    ##loading clusters imposed on simulated data
    # file.path inserts the separator, so in.folder works with or without a trailing slash
    cl <- as.matrix(read.table(file.path(in.folder,"clusters.txt"),sep="\t",row.names=1,header=TRUE))
    cl2 <- as.matrix(as.numeric(cl[,2]))
    rownames(cl2) <- cl[,1]

    ## Assigning samples to cluster based on factors
    # the method index is never reused by an inner loop, so each result lands in its own column
    for(method.idx in seq_along(factorizations)){
        if(methods[method.idx]=="iCluster"){
            # iCluster provides its own cluster assignment
            JI_final[,method.idx] <- .best_jaccard_per_cluster(as.matrix(icluster.clusters),cl2,number_cl)
        }else if(methods[method.idx]=="intNMF"){
            # intNMF provides its own cluster assignment
            JI_final[,method.idx] <- .best_jaccard_per_cluster(as.matrix(intNMF.clusters),cl2,number_cl)
        }else{
            factors <- factorizations[[method.idx]][[1]]
            ###clustering by Kmeans; kmeans starts from random centers, so the
            ###Jaccard Indexes are averaged over repeated runs
            JI_runs <- matrix(NA_real_,nrow=kmeans.runs,ncol=number_cl)
            for(run in seq_len(kmeans.runs)){
                kmeans.out <- kmeans(factors,centers=number_cl)
                JI_runs[run,] <- .best_jaccard_per_cluster(as.matrix(kmeans.out$cluster),cl2,number_cl)
            }
            JI_final[,method.idx] <- colMeans(JI_runs)
        }
    }

    colnames(JI_final) <- methods
    write.table(JI_final,file.path(out.folder,"JI.txt"),sep="\t",row.names=TRUE,col.names=TRUE)
    invisible(JI_final)
}

## Comparison based on metagenes correlation


## Running the comparison

In [None]:
#loading the function running the factorization
source("runfactorization.R")

# folder holding the simulated data; it is rewritten for every scenario
data.folder <- "../data/simulations"

# For each scenario (number of clusters x cluster-size distribution): simulate the
# data, run the factorizations on it, then score the resulting cluster assignments.
for(num.clusters in c(5,10,15)){
    for(size in c("heterogeneous","equal")){
        # one results folder per scenario, otherwise every iteration would
        # overwrite the JI.txt written by the previous one
        results.folder <- file.path("../results",paste0("simulated_",num.clusters,"_",size))
        dir.create(results.folder,recursive=TRUE,showWarnings=FALSE)

        simulated_data_generation(data.folder,num.clusters,size)
        out<-runfactorization(data.folder,c("omics1.txt","omics2.txt","omics3.txt"),num.clusters,sep="\t",filtering="none")
        # the input folder is passed with a trailing separator so that the path to
        # clusters.txt stays valid even if folder and file name are simply concatenated;
        # the number of clusters is the loop variable num.clusters
        clusters_comparison(out$factorizations,out$methods,paste0(data.folder,"/"),results.folder,out$icluster.clusters,out$intNMF.clusters,num.clusters)

    }
}