# Comparison performances in simulated data

Here the performances of the 9 multi-omics factorization methods are compared based on their performances on simulated data.

## Construction of the simulated data

The function to produce the simulated data is here defined. The function generates multi-omics simulated data (expression, methylation and protein quantification) composed of 100 samples, using the CRAN InserSIM package. Two different scenarios are considered, equally spaced clusters and heterogeneous ones. Moreover,different numbers of clusters are imposed on the data: 5,10,15. 

In [2]:
library("InterSIM")
source("runfactorization.R")

In [9]:
dir.create("../data")
dir.create("../data/simulations")
dir.create("../results")

“'../results' already exists”

In [4]:
## INPUTS:
#folder = location  where the simulated data should be saved
#num.clusters = number of clusters to be imposed on the data
#size = heterogeneous for heterogeneous clusters, equal for equally-sized clusters

## OUPUTS: matrices of simulated data are saved to file in folder

simulated_data_generation <- function(out.folder, num.clusters, size="heterogeneous") {
    
    num.clusters <- as.numeric(num.clusters)
    
    ###the proportions of samples per clusters here defined are those used for the paper
    if (size=="heterogeneous") {
       if (num.clusters==5) {
           prop <- c(0.35, 0.13, 0.19, 0.08, 0.25)
       } else if (num.clusters==10) {
           prop <- c(0.20, 0.10, 0.07, 0.10, 0.15, 0.13, 0.10, 0.08, 0.05, 0.02)
       } else {
           prop <- c(0.10,0.08,0.04,0.03,0.12,0.03,0.10,0.03,0.05,0.02,0.1,0.2,0.03,0.02,0.05)
       }
    } else if (size=="equal") {
       if (num.clusters==5) {
           prop <- c(0.25,0.2,0.2,0.2,0.15)
       } else if (num.clusters==10) {
           prop <- c(0.15,0.1,0.1,0.10,0.1,0.1,0.05,0.1,0.1,0.1)
       } else {
           prop <- c(0.07,0.07,0.07,0.06,0.07,0.07,0.07,0.06,0.07,0.06,0.07,0.06,0.07,0.06,0.07)
       } 
    } else {
        print("ERROR: size can only assume value heterogeneous or equal")
    }
    
    effect <- 2.5

    sim.D <- InterSIM(n.sample=100, cluster.sample.prop=prop, delta.methyl=effect,
              delta.expr=effect, delta.protein=effect, p.DMP=0.25, p.DEG=NULL, p.DEP=NULL,
              do.plot=FALSE, sample.cluster=TRUE, feature.cluster=TRUE)

    write.table(sim.D$clustering.assignment, paste(out.folder, "clusters.txt", sep="/"), sep="\t")
    write_table_with_index_header(t(sim.D$dat.methyl), paste(out.folder, "omics1.txt", sep="/"))
    write_table_with_index_header(t(sim.D$dat.expr), paste(out.folder, "omics2.txt", sep="/"))
    write_table_with_index_header(t(sim.D$dat.protein), paste(out.folder, "omics3.txt", sep="/"))

    return("data saved in folder")
}

write_table_with_index_header <- function(data, file, title="probe", sep="\t") {
    cat(title, sep, file=file)
    write.table(data, file, sep=sep, append=T)
}

Here the performances of the 9 multi-omics factorization methods are compared on simulated data. 

## Comparison based on clustering assignment

The function comparing their performances in terms of clustering assignment is here defined. The function computes the Jaccard Index of intersection between the clusters predicted from factorization versus ground truth imposed in simulations. The cluster assignment is computed from the factors by kmeans clustering.

In [5]:
## INPUTS: 
#factorizations= list of factorization results
#methods= list of ran factorization methods
#in.folder= path to input folder
#out.folder= path to output folder
#icluster.clusters= clustering result of iCluster
#intNMF.clusters= clustering result of intNMF
#number_cl= number of clusters

## OUPUTS: matrix of Jaccard Indexes is saved to table in out.folder

clusters_comparison <- function(factorizations,methods,in.folder,out.folder,icluster.clusters,intNMF.clusters,number_cl) {

    ind <- 0
    JI_final <- matrix(data=NA, nrow=number_cl, ncol=length(factorizations))

    ##charging clusters imposed on simulated data 
    cl  <- as.matrix(read.table(paste(in.folder,"clusters.txt",sep=""),sep="\t",row.names=1,header=T))
    cl2 <- as.matrix(as.numeric(cl[,2]))
    rownames(cl2) <- cl[,1]

    ## Assigning samples to cluster based on factors
    for (i in 1:length(factorizations)) {
        if (methods[i]=="iCluster" || methods[i]=="intNMF") {
            if(methods[i]=="iCluster") {
                clust_iCluster <- as.matrix(icluster.clusters)
            } else {
                clust_iCluster <- as.matrix(intNMF.clusters)
            }
            ######creation sets of samples
            JI_mat <- numeric(0)
            for (p in 1:number_cl) {
                x1 <- rownames(clust_iCluster)[which(clust_iCluster[,1]==p)]
                row <- numeric(0)
                for (j in 1:number_cl) {
                    x2  <- rownames(cl2)[which(cl2[,1]==j)]
                    I   <- length(intersect(x1,x2))
                    S   <- I/(length(x1)+length(x2)-I)
                    row <- cbind(row,S)
                }
                JI_mat <- rbind(JI_mat,row)
            }
            JI_final[1:number_cl,i]<-apply(JI_mat,1,max)
            
        } else {
            factors <- factorizations[[i]][[1]]
            ###clustering by Kmeans
            JI_good <- numeric(0)
            for (run in 1:1000) {
                kmeans.out <- kmeans(factors, centers=number_cl)
                kmeans.out
                clust_iCluster <- as.matrix(kmeans.out$cluster)
                ######creation sets of samples
                JI_mat <- numeric(0)
                for (p in 1:number_cl) {
                    x1  <- rownames(clust_iCluster)[which(clust_iCluster[,1]==p)]
                    row <- numeric(0)
                    for(j in 1:number_cl) {
                        x2  <- rownames(cl2)[which(cl2[,1]==j)]
                        I   <- length(intersect(x1,x2))
                        S   <- I/(length(x1)+length(x2)-I)
                        row <- cbind(row,S)
                     }
                     JI_mat <- rbind(JI_mat,row)
                 }
                 JI_good <- rbind(JI_good,apply(JI_mat,1,max))
            }
              JI_final[1:number_cl,i] <- apply(JI_good,2,mean)
        }
    }
    colnames(JI_final) <- methods
    write.table(JI_final, paste(out.folder,"/JI.txt",sep=""), sep="\t", row.names=F, col.names=T)
}

## Running the comparison

In [11]:
#charging the function running the factorization
#for(num.clusters in c(5,10,15)) {
for(num.clusters in c(5)) {
    for (size in c("heterogeneous","equal")) {
        simulated_data_generation("../data/simulations", num.clusters,size)
        out <- runfactorization("../data/simulations", c("omics1.txt","omics2.txt","omics3.txt"), num.clusters, sep="\t", filtering="none")
        folder <- paste("../results/", num.clusters, "_", size,sep="")
        dir.create(folder)
        clusters_comparison(out$factorizations,out$method,"../data/simulations/",folder,out$icluster.clusters,out$intNMF.clusters,num.clusters)
    }
}

“View names are not specified in the data, renaming them to: view_1, view_2, view_3
”Creating MOFA object from list of matrices,
 please make sure that samples are columns and features are rows...

Checking data options...
Checking training options...
Checking model options...


[1] "No output file provided, using a temporary file..."
Generating warm start... 
K=6:123


“View names are not specified in the data, renaming them to: view_1, view_2, view_3
”Creating MOFA object from list of matrices,
 please make sure that samples are columns and features are rows...

Checking data options...
Checking training options...
Checking model options...


[1] "No output file provided, using a temporary file..."
Generating warm start... 
K=6:1234


## Generating plots with results

In [12]:
# Save all boxplots in a single PDF output file
pdf(file="../results/simulated_boxplots.pdf", width = 15, height = 15)

#for (i in c(5,10,15)) {
for (i in c(5)) {
    JI.eq  <- as.matrix(read.table(paste("../results/",i,"_equal/","JI.txt",sep=""),sep="\t",header=T))
    JI.het <- as.matrix(read.table(paste("../results/",i,"_heterogeneous/","JI.txt",sep=""),sep="\t",header=T))
    colnames(JI.eq) <- paste(colnames(JI.eq), "_EQ", sep="")
    
    JI.final <- numeric(0)
    col <- numeric(0)
    for (j in 1:dim(JI.eq)[2]) {
        JI.final <- cbind(JI.final,JI.het[,j],JI.eq[,j])
        col <- rbind(col,colnames(JI.het)[j],colnames(JI.eq)[j])
    }
    colnames(JI.final)<-col
    
    par(mar=c(25,4,2,2)+.1)
    p <- boxplot(JI.final,xaxt="none",cex.axis=3.5,col=c('gray','gray','red','red','blue','blue','blueviolet','blueviolet','deeppink','deeppink','chocolate1','chocolate1','darkgoldenrod','darkgoldenrod','green','green','darkturquoise','darkturquoise'),ann=FALSE,outline=FALSE)
    matplot(1:dim(JI.final)[2], t(JI.final),col="black",pch=16,xaxt="none",cex=0.8,add=TRUE)
    axis(1, at=1:dim(JI.final)[2], labels=colnames(JI.final),las=2,srt=45, cex=0.8,cex.lab=3.5, cex.axis=3.5, cex.main=1.5, cex.sub=1.5) 
    title(main=paste(i,"clusters",sep=" "),cex.lab=0.75, line = -2.5, adj=0,cex.main=3.5)
}
dev.off()