# CyTOFmerge for imputation benchmarking

In [1]:
library(dplyr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [7]:
# Revised CyTOFmerge function: merge two expression tables that share a subset
# of markers and impute each table's missing markers from the other table.
#
# Markers are matched by column name. For every cell, the k nearest cells of
# the *other* table are found in the shared-marker space (FNN::get.knnx) and
# the markers that cell lacks are filled with the per-marker median over those
# neighbours.
#
# Arguments:
#   FCS1.data, FCS2.data  cells x markers tables (anything as.data.frame()
#                         accepts).
#   arcsinhTrans          if TRUE, every numeric column is replaced by
#                         asinh(x / cofactor) before matching.
#   cofactor              divisor used by the arcsinh transform.
#   k                     number of neighbours; capped at the row count of the
#                         table being searched, with a minimum of 1.
#   algorithm             search algorithm handed to FNN::get.knnx(); "brute"
#                         is used instead when the searched table has fewer
#                         than 20 rows.
#
# Value:
#   A data.frame with nrow(FCS1.data) + nrow(FCS2.data) rows (FCS1 rows first)
#   and columns ordered [shared | FCS1-only | FCS2-only].
#
# Errors:
#   Stops when the tables share no marker, when either table has no rows, or
#   when the shared markers are non-numeric or contain missing values.
CombineFCS <- function(FCS1.data, FCS2.data, arcsinhTrans = FALSE, cofactor = 5,
                       k = 50, algorithm = "kd_tree") {
  FCS1.data <- as.data.frame(FCS1.data)
  FCS2.data <- as.data.frame(FCS2.data)

  VarNames1 <- colnames(FCS1.data)
  VarNames2 <- colnames(FCS2.data)

  if (arcsinhTrans) {
    # Non-numeric columns are passed through untouched.
    asinh_if_numeric <- function(x) {
      if (is.numeric(x)) asinh(x / cofactor) else x
    }
    FCS1.data[] <- lapply(FCS1.data, asinh_if_numeric)
    FCS2.data[] <- lapply(FCS2.data, asinh_if_numeric)
  }

  # Position in FCS2 of every FCS1 column (first hit; 0 = absent from FCS2).
  Matches <- match(VarNames1, VarNames2, nomatch = 0L)
  Shared_Index <- which(Matches > 0L)
  m <- length(Shared_Index)
  if (m == 0L) stop("No shared markers found.")

  n1 <- nrow(FCS1.data)
  n2 <- nrow(FCS2.data)
  if (n1 == 0L || n2 == 0L) {
    stop("Both FCS1.data and FCS2.data must contain at least one row.")
  }

  # Non-shared columns are the positional complement of the shared ones, so
  # nothing is dropped even when a marker name occurs more than once.
  shared_in_2 <- Matches[Shared_Index]
  non1_pos <- setdiff(seq_along(VarNames1), Shared_Index)
  non2_pos <- setdiff(seq_along(VarNames2), shared_in_2)
  p1 <- length(non1_pos)
  p2 <- length(non2_pos)

  F1_shared <- FCS1.data[, Shared_Index, drop = FALSE]
  F2_shared <- FCS2.data[, shared_in_2, drop = FALSE]

  # Convert the shared block once; it feeds both searches and the output.
  S1 <- as.matrix(F1_shared)
  S2 <- as.matrix(F2_shared)
  if (!is.numeric(S1) || !is.numeric(S2)) {
    stop("Shared markers must be numeric in both inputs.")
  }
  if (anyNA(S1) || anyNA(S2)) {
    stop("Shared markers must not contain missing values.")
  }

  # KNN on the shared block (k capped to the available sample size).
  k1 <- max(1L, min(k, n2))
  k2 <- max(1L, min(k, n1))
  alg1 <- if (n2 < 20L) "brute" else algorithm
  alg2 <- if (n1 < 20L) "brute" else algorithm

  IDX1 <- FNN::get.knnx(S2, S1, k = k1, algorithm = alg1)$nn.index
  IDX2 <- FNN::get.knnx(S1, S2, k = k2, algorithm = alg2)$nn.index

  # Per-marker median of `donor` over each query cell's neighbours.
  impute_from_neighbours <- function(donor, nn_index) {
    imputed <- matrix(NA_real_, nrow = nrow(nn_index), ncol = ncol(donor))
    for (i in seq_len(nrow(nn_index))) {
      neigh <- donor[nn_index[i, ], , drop = FALSE]
      imputed[i, ] <- apply(neigh, 2, stats::median, na.rm = TRUE)
    }
    imputed
  }

  # Combined layout: [shared | FCS1-only | FCS2-only].
  shared_cols <- seq_len(m)
  own1_cols <- m + seq_len(p1)
  own2_cols <- m + p1 + seq_len(p2)

  Data.combine.1 <- matrix(NA_real_, nrow = n1, ncol = m + p1 + p2)
  Data.combine.2 <- matrix(NA_real_, nrow = n2, ncol = m + p1 + p2)
  Data.combine.1[, shared_cols] <- S1
  Data.combine.2[, shared_cols] <- S2

  if (p1 > 0L) {
    X1_non <- as.matrix(FCS1.data[, non1_pos, drop = FALSE])
    Data.combine.1[, own1_cols] <- X1_non
    Data.combine.2[, own1_cols] <- impute_from_neighbours(X1_non, IDX2)
  }
  if (p2 > 0L) {
    X2_non <- as.matrix(FCS2.data[, non2_pos, drop = FALSE])
    Data.combine.2[, own2_cols] <- X2_non
    Data.combine.1[, own2_cols] <- impute_from_neighbours(X2_non, IDX1)
  }

  Data.combine <- rbind(Data.combine.1, Data.combine.2)
  colnames(Data.combine) <- c(
    colnames(F1_shared), VarNames1[non1_pos], VarNames2[non2_pos]
  )
  as.data.frame(Data.combine, check.names = FALSE)
}


In [3]:
# load the scaled expression table and the per-cell metadata;
# the first column of each CSV is dropped after reading
expr <- read.csv(
  '/home/projects/amit/floriani/Lab/PROJECTS/FlowVI/data/2024-01-16_model_eval_multi_batch/2025-08-26_aurora_imputation_expression_scaled.csv'
)[, -1]
md <- read.csv(
  '/home/projects/amit/floriani/Lab/PROJECTS/FlowVI/data/2024-01-16_model_eval_multi_batch/2024-07-05_aurora_imputation_obs.csv'
)[, -1]

# column names of the expression table as read from disk
markers <- colnames(expr)

In [4]:
# attach the batch and sample labels from the metadata to the expression table
expr[, "batch"] <- md$synth_batch
expr[, "sample"] <- md$sample_id

In [5]:
# split by batch label and drop the annotation columns again
expr_batch1 <- expr %>%
  filter(batch == '0') %>%
  select(-batch, -sample)
expr_batch2 <- expr %>%
  filter(batch == '1') %>%
  select(-batch, -sample)

In [19]:
# loop over all markers and impute each one by leaving it out from batch 2
imputed_list <- vector("list", length(markers))
names(imputed_list) <- markers

# CombineFCS() stacks batch 1 rows first, so batch 2 occupies the rows after
# them; seq_len() keeps the range empty (not reversed) if batch 2 has no rows
batch2_rows <- nrow(expr_batch1) + seq_len(nrow(expr_batch2))

for (mask_marker in markers) {
  BB_marker <- setdiff(markers, mask_marker)

  # drop = FALSE keeps a data.frame (and the column name) even when only a
  # single marker remains after masking
  res <- CombineFCS(
    expr_batch1,
    expr_batch2[, BB_marker, drop = FALSE],
    arcsinhTrans = FALSE
  )
  imputed_list[[mask_marker]] <- res[batch2_rows, mask_marker]
}

# one column per masked marker, one row per batch 2 cell
imputed_df <- do.call(cbind, imputed_list)

In [9]:
# save the imputed batch 2 values for downstream evaluation
write.csv(
  imputed_df,
  file = '/home/projects/amit/floriani/Lab/PROJECTS/FlowVI/data/2024-01-16_model_eval_multi_batch/2025-08-26_aurora_imputation_CyTOFmerge_imputed.csv'
)