## Rank aggregation

Multiple lists of ranks have been obtained with network-based proximity.

In [1]:
library(TopKLists)

# Test data

In [2]:
# Load the example `breast` data set (three ranked gene lists: TransBig, MDCC,
# Pusztai) and preview its first six rows.
data("breast")
head(breast, n = 6L)

Unnamed: 0_level_0,TransBig,MDCC,Pusztai
Unnamed: 0_level_1,<fct>,<fct>,<fct>
1,ESR1,ESR1,ESR1
2,TBC1D9,TBC1D9,TBC1D9
3,SCUBE2,EVL,SCUBE2
4,EVL,SCUBE2,FBP1
5,NAT2,CIRBP,EVL
6,BTG2,FBP1,RHOB


In [3]:
# Deltaplots for the test data, using delta values 0, 5, ..., 300.
deltaplot.dir <- "test-deltaplot"
# showWarnings = FALSE keeps the cell quiet when the directory already exists.
dir.create(path = deltaplot.dir, showWarnings = FALSE)
# The result is assigned to `a` so that it is not auto-displayed by the notebook.
a <- deltaplot(
    breast,
    deltas = seq(from = 0, to = 300, by = 5),
    directory = deltaplot.dir
)

In [4]:
# Run j0.multi on the three test lists; `d` is reported as `delta` and `v` as
# `v` in the resulting table, which has one row per ordered pair of lists.
res <- j0.multi(breast, d=6, v=10)
# Display the result (the last expression of the cell is shown by the notebook).
res

list1,list2,v,j0_est,k,delta
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
TransBig,MDCC,10,9,8,6
TransBig,Pusztai,10,9,8,6
MDCC,TransBig,10,11,10,6
MDCC,Pusztai,10,13,12,6
Pusztai,TransBig,10,11,10,6
Pusztai,MDCC,10,15,14,6

TransBig_MDCC,TransBig_Pusztai,MDCC_TransBig,MDCC_Pusztai,Pusztai_TransBig,Pusztai_MDCC
1,1,1,1,1,1
1,1,1,1,1,1
1,1,1,1,1,1
1,1,1,1,1,1
0,0,1,1,1,1
1,1,1,1,0,1
0,0,1,1,0,1
1,1,0,1,1,1
1,1,0,0,0,1
0,0,1,1,1,1


# ChEMBL

In [5]:
# Read the ranked ChEMBL IDs: one column per gene set, one row per rank.
fpath <- "../../results/2021-12-02-proximity-various-ADgenesets/ranked-chembl-ids.csv"
chembl_ids <- read.csv(file = fpath, as.is = TRUE)
# Drop the first column (presumably the row index written with the CSV -- TODO confirm).
chembl_ids <- chembl_ids[-1]
#chembl_ids <- chembl_ids[1:3]
# Remove the first "." of every column name; any later dots are kept.
chembl_ids <- setNames(chembl_ids, sub("\\.", "", names(chembl_ids)))
head(chembl_ids)

Unnamed: 0_level_0,knowledge,TWAS2,agora2,agora,DESudhir,ADDE.APOE3.APOE3,ADDE.APOE4.APOE4,APOE34.DE.neuron,APOE34.DE.astrocyte,APOE34.DE.microglia
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,CHEMBL1770916,CHEMBL1165,CHEMBL838,CHEMBL1522,CHEMBL2336071,CHEMBL1121,CHEMBL927,CHEMBL1909324,CHEMBL3545043,CHEMBL11662
2,CHEMBL520733,CHEMBL1168,CHEMBL317094,CHEMBL251940,CHEMBL3358920,CHEMBL1201185,CHEMBL1121,CHEMBL1318287,CHEMBL2386081,CHEMBL428
3,CHEMBL360328,CHEMBL577,CHEMBL1592,CHEMBL125,CHEMBL306043,CHEMBL262135,CHEMBL1823872,CHEMBL1139,CHEMBL83626,CHEMBL1401367
4,CHEMBL99946,CHEMBL1581,CHEMBL1168,CHEMBL334966,CHEMBL1096146,CHEMBL1823872,CHEMBL262135,CHEMBL46257,CHEMBL3828074,CHEMBL1255654
5,CHEMBL14370,CHEMBL317094,CHEMBL1581,CHEMBL398435,CHEMBL87992,CHEMBL525610,CHEMBL1201185,CHEMBL446452,CHEMBL426559,CHEMBL1200930
6,CHEMBL201960,CHEMBL838,CHEMBL1237,CHEMBL404520,CHEMBL314854,CHEMBL1536,CHEMBL1131,CHEMBL1193,CHEMBL1951143,CHEMBL1177


## Prepare top-$k$ lists

In [6]:
# One set of deltaplots per fraction (20 %, 50 %, 100 %) of the list length.
# Each fraction gets its own output directory, a delta grid that scales with
# the fraction, and `subset.lists` set to that fraction of the row count.
for (frac in c(0.2, 0.5, 1)) {
    deltaplot.dir <- paste0("chembl-deltaplot-", frac)
    dir.create(path = deltaplot.dir, showWarnings = FALSE)
    a <- deltaplot(
        chembl_ids,
        deltas = seq(from = 0, to = 1000 * frac, by = 5),
        directory = deltaplot.dir,
        subset.lists = nrow(chembl_ids) * frac
    )
}

The deltaplots show that $\delta$ varies between 100 and >300.  It looked to me that $\delta \approx 250$, so I set `d = 250` in the code below.

In [7]:
# Run j0.multi on all ChEMBL rankings with delta = 250 (the value read off the
# deltaplots) and v = 100. The `maxK` element of the result is later used by
# agg.make.inputs to truncate the lists.
degeneration <- j0.multi(chembl_ids, d=250, v=100)
# Display the result (the last expression of the cell is shown by the notebook).
degeneration

list1,list2,v,j0_est,k,delta
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
knowledge,TWAS2,100,,,250
knowledge,agora2,100,,,250
knowledge,agora,100,,,250
knowledge,DESudhir,100,20,19,250
knowledge,ADDE.APOE3.APOE3,100,22,21,250
knowledge,ADDE.APOE4.APOE4,100,,,250
knowledge,APOE34.DE.neuron,100,,,250
knowledge,APOE34.DE.astrocyte,100,,,250
knowledge,APOE34.DE.microglia,100,,,250
TWAS2,knowledge,100,,,250

knowledge_TWAS2,knowledge_agora2,knowledge_agora,knowledge_DESudhir,knowledge_ADDE.APOE3.APOE3,knowledge_ADDE.APOE4.APOE4,knowledge_APOE34.DE.neuron,knowledge_APOE34.DE.astrocyte,knowledge_APOE34.DE.microglia,TWAS2_knowledge,⋯,APOE34.DE.astrocyte_APOE34.DE.microglia,APOE34.DE.microglia_knowledge,APOE34.DE.microglia_TWAS2,APOE34.DE.microglia_agora2,APOE34.DE.microglia_agora,APOE34.DE.microglia_DESudhir,APOE34.DE.microglia_ADDE.APOE3.APOE3,APOE34.DE.microglia_ADDE.APOE4.APOE4,APOE34.DE.microglia_APOE34.DE.neuron,APOE34.DE.microglia_APOE34.DE.astrocyte
0,1,0,0,0,0,0,0,1,0,⋯,0,0,1,0,0,0,0,0,1,0
0,1,0,0,0,0,0,0,1,0,⋯,0,0,1,0,0,0,0,0,1,0
0,0,1,0,1,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,1
0,0,0,1,1,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,1
0,0,0,1,1,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,1,1
0,0,0,1,1,0,0,0,0,0,⋯,0,1,0,0,0,1,0,0,0,1
0,0,0,1,1,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,1,1,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,1,1,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,1,1,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In the `agg.make.inputs` function, the `fullSpace=TRUE` argument means that the space of the aggregated list is that of all 2413 ChEMBL drugs.  On the other hand, `fullSpace=FALSE` sets the space of the aggregated list (the "aggregate space") to the union of the spaces of the base rankers' lists.

This has an important implication: for `fullSpace=TRUE` the aggregate space is invariable, while for `fullSpace=FALSE` it depends on which base rankers are taken into account.

In [8]:
#' Build the `input` and `space` lists passed to the aggregation functions.
#'
#' @param chembl_ids Data frame with one column per base ranker; each column
#'   holds identifiers ordered by rank.
#' @param fullSpace If `TRUE`, the space of every ranker is the complete first
#'   column of `chembl_ids` (this assumes every column ranks the same full set
#'   of drugs -- TODO confirm). If `FALSE`, the space is the union of the
#'   truncated lists in `input`.
#' @param maxK Number of top-ranked rows kept for every ranker. Defaults to the
#'   global `degeneration$maxK`, which is what the function previously read
#'   unconditionally; values larger than `nrow(chembl_ids)` are clipped.
#' @return A list with two elements: `input` (named list of truncated ranked
#'   lists) and `space` (named list holding the same space vector for every
#'   ranker).
agg.make.inputs <- function(chembl_ids, fullSpace = FALSE,
                            maxK = degeneration$maxK) {
    stopifnot(
        is.data.frame(chembl_ids),
        is.numeric(maxK), length(maxK) == 1L, !is.na(maxK), maxK >= 0
    )
    # seq_len() handles maxK == 0, and clipping avoids NA-padded rows when
    # maxK exceeds the number of ranked items.
    top.rows <- seq_len(min(maxK, nrow(chembl_ids)))
    # drop = FALSE keeps a one-column data frame from collapsing to a vector,
    # which would turn `input` into a list of single identifiers.
    input <- as.list(chembl_ids[top.rows, , drop = FALSE])
    common <- if (fullSpace) chembl_ids[[1]] else unique(unlist(input))
    space <- rep(list(common), length(chembl_ids))
    names(space) <- names(chembl_ids)
    list(input = input, space = space)
}

# Build both variants of the aggregation inputs: the full space (first column
# of `chembl_ids`) and the union space (the function's default).
agg.inputs.full <- agg.make.inputs(chembl_ids, fullSpace = TRUE)
agg.inputs <- agg.make.inputs(chembl_ids)

In [9]:
# Inspect the structure of the aggregation inputs built with fullSpace = FALSE.
str(agg.inputs)

List of 2
 $ input:List of 10
  ..$ knowledge          : chr [1:110] "CHEMBL1770916" "CHEMBL520733" "CHEMBL360328" "CHEMBL99946" ...
  ..$ TWAS2              : chr [1:110] "CHEMBL1165" "CHEMBL1168" "CHEMBL577" "CHEMBL1581" ...
  ..$ agora2             : chr [1:110] "CHEMBL838" "CHEMBL317094" "CHEMBL1592" "CHEMBL1168" ...
  ..$ agora              : chr [1:110] "CHEMBL1522" "CHEMBL251940" "CHEMBL125" "CHEMBL334966" ...
  ..$ DESudhir           : chr [1:110] "CHEMBL2336071" "CHEMBL3358920" "CHEMBL306043" "CHEMBL1096146" ...
  ..$ ADDE.APOE3.APOE3   : chr [1:110] "CHEMBL1121" "CHEMBL1201185" "CHEMBL262135" "CHEMBL1823872" ...
  ..$ ADDE.APOE4.APOE4   : chr [1:110] "CHEMBL927" "CHEMBL1121" "CHEMBL1823872" "CHEMBL262135" ...
  ..$ APOE34.DE.neuron   : chr [1:110] "CHEMBL1909324" "CHEMBL1318287" "CHEMBL1139" "CHEMBL46257" ...
  ..$ APOE34.DE.astrocyte: chr [1:110] "CHEMBL3545043" "CHEMBL2386081" "CHEMBL83626" "CHEMBL3828074" ...
  ..$ APOE34.DE.microglia: chr [1:110] "CHEMBL11662" "CHEMBL428"

## List aggregation

In [13]:
# Base rankers to aggregate; the commented line selects all of them instead.
#sel.cond <- names(chembl_ids)
sel.cond <- c("knowledge", "TWAS2", "APOE34.DE.neuron")

# Borda aggregation on the union space and on the full space.
outBorda <- with(agg.inputs, Borda(input[sel.cond], space[sel.cond]))
outBorda.full <- with(agg.inputs.full, Borda(input[sel.cond], space[sel.cond]))

# MC aggregation on the union space, with wall-clock timing.
MC.start <- Sys.time()
outMC <- with(agg.inputs, MC(input[sel.cond], space[sel.cond]))
MC.stop <- Sys.time()

# MC aggregation on the full space, with wall-clock timing.
MC.full.start <- Sys.time()
outMC.full <- with(agg.inputs.full, MC(input[sel.cond], space[sel.cond]))
MC.full.stop <- Sys.time()

In [15]:
# Elapsed wall-clock time of the two MC runs (union space, then full space).
difftime(MC.stop, MC.start)
difftime(MC.full.stop, MC.full.start)

Time difference of 5.685871 secs

Time difference of 8.954499 secs

In [None]:
# CEMC aggregation on the union space, with wall-clock timing.
CEMC.start <- Sys.time()
outCEMC <- with(agg.inputs, CEMC(input[sel.cond], space[sel.cond]))
CEMC.stop <- Sys.time()

# CEMC aggregation on the full space, with wall-clock timing.
CEMC.full.start <- Sys.time()
outCEMC.full <- with(agg.inputs.full, CEMC(input[sel.cond], space[sel.cond]))
CEMC.full.stop <- Sys.time()

In [None]:
# Elapsed wall-clock time of the two CEMC runs (union space, then full space).
difftime(CEMC.stop, CEMC.start)
difftime(CEMC.full.stop, CEMC.full.start)