In [1]:
suppressPackageStartupMessages({
  library(DuoClustering2018)
})

Using temporary cache /tmp/RtmpXb02iC/BiocFileCache

snapshotDate(): 2019-10-22

Using temporary cache /tmp/RtmpXb02iC/BiocFileCache

Using temporary cache /tmp/RtmpXb02iC/BiocFileCache



In [2]:
## Apply Seurat

suppressPackageStartupMessages({
  library(Seurat)
})

apply_Seurat <- function(sce, params, resolution) {
  (seed <- round(1e6*runif(1)))
  tryCatch({
    dat <- counts(sce)
    st <- system.time({
      data <- CreateSeuratObject(raw.data = dat, min.cells = params$min.cells,
                                 min.genes = params$min.genes, project = "scRNAseq", 
                                 display.progress = FALSE) 
      data <- NormalizeData(object = data, normalization.method = "LogNormalize", 
                            scale.factor = 1e4, display.progress = FALSE)
      data <- ScaleData(object = data, display.progress = FALSE)
      data <- RunPCA(object = data, pc.genes = rownames(data@data), do.print = FALSE, 
                     pcs.compute = max(params$dims.use), seed.use = seed)
      data <- FindClusters(object = data, reduction.type = "pca", save.SNN = TRUE, 
                           dims.use = params$dims.use, k.param = 30,
                           resolution = resolution, print.output = 0, 
                           random.seed = seed)
      cluster <- data@ident
    })
    
    st <- c(user.self = st[["user.self"]], sys.self = st[["sys.self"]], 
            user.child = st[["user.child"]], sys.child = st[["sys.child"]],
            elapsed = st[["elapsed"]])
    list(st = st, cluster = cluster, est_k = NA)
  }, error = function(e) {
    list(st = c(user.self = NA, sys.self = NA, user.child = NA, sys.child = NA,
                elapsed = NA), 
         cluster = structure(rep(NA, ncol(sce)), names = colnames(sce)),
         est_k = NA)
  })
}

In [3]:
## Apply SC3

suppressPackageStartupMessages({
  library(scater)
  library(SC3)
})

apply_SC3 <- function(sce, params, k) {
  (seed <- round(1e6*runif(1)))
  tryCatch({
    rowData(sce)$feature_symbol <- rownames(counts(sce))
    st1 <- system.time({
      dat <- sc3_prepare(sce, gene_filter = params$gene_filter, 
                         pct_dropout_min = params$pct_dropout_min, 
                         pct_dropout_max = params$pct_dropout_max, 
                         svm_max = 1e6, n_cores = 1, rand_seed = seed)
    })
    est_k <- metadata(sc3_estimate_k(dat))$sc3$k_estimation
    st2 <- system.time({
      dat <- sc3(dat, ks = k, pct_dropout_min = params$pct_dropout_min,
                 pct_dropout_max = params$pct_dropout_max,
                 gene_filter = params$gene_filter, rand_seed = seed, n_cores = 1,
                 biology = FALSE, k_estimator = FALSE, svm_max = 1e6)
      cluster <- as.numeric(colData(dat)[, paste0("sc3_", k, "_clusters")])
      names(cluster) <- rownames(colData(dat))
    })
    
    st <- st1 + st2
    st <- c(user.self = st[["user.self"]], sys.self = st[["sys.self"]], 
            user.child = st[["user.child"]], sys.child = st[["sys.child"]],
            elapsed = st[["elapsed"]])
    list(st = st, cluster = cluster, est_k = est_k)
  },
  error = function(e) {
    list(st = c(user.self = NA, sys.self = NA, user.child = NA, sys.child = NA,
                elapsed = NA), 
         cluster = structure(rep(NA, ncol(sce)), names = colnames(sce)),
         est_k = NA)
  })
}

In [36]:
"""
suppressPackageStartupMessages({
  library(ExperimentHub)
})
eh <- ExperimentHub()
eh_duo=query(eh, "DuoClustering2018")
"""

In [42]:
slotNames(eh_duo)

In [44]:
sce_filteredM3Drop10_Koh()

Using temporary cache /tmp/RtmplSzqSv/BiocFileCache

snapshotDate(): 2019-10-22

Using temporary cache /tmp/RtmplSzqSv/BiocFileCache

see ?DuoClustering2018 and browseVignettes('DuoClustering2018') for documentation

Using temporary cache /tmp/RtmplSzqSv/BiocFileCache

Using temporary cache /tmp/RtmplSzqSv/BiocFileCache

downloading 1 resources

retrieving 1 resource

Using temporary cache /tmp/RtmplSzqSv/BiocFileCache

“download failed
  web resource path: ‘https://experimenthub.bioconductor.org/fetch/1503’
  local file path: ‘/tmp/RtmplSzqSv/BiocFileCache/2ddb93e17fa44_1503’
  reason: Internal Server Error (HTTP 500).”
“bfcadd() failed; resource removed
  rid: BFC5
  fpath: ‘https://experimenthub.bioconductor.org/fetch/1503’
  reason: download failed”
“download failed
  hub path: ‘https://experimenthub.bioconductor.org/fetch/1503’
  cache resource: ‘EH1503 : 1503’


ERROR: Error: failed to load resource
  name: EH1503
  title: sce_filteredM3Drop10_Koh
  reason: 1 resources failed to download


In [43]:
eh_duo

ExperimentHub with 122 records
# snapshotDate(): 2019-10-22 
# $dataprovider: Robinson group (UZH), 10x Genomics, Zheng et al (2017), SRA...
# $species: Homo sapiens, Mus musculus, NA
# $rdataclass: data.frame, SingleCellExperiment, list
# additional mcols(): taxonomyid, genome, description,
#   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
#   rdatapath, sourceurl, sourcetype 
# retrieve records with, e.g., 'object[["EH1499"]]' 

           title                                               
  EH1499 | duo_clustering_all_parameter_settings_v1            
  EH1500 | sce_full_Koh                                        
  EH1501 | sce_filteredExpr10_Koh                              
  EH1502 | sce_filteredHVG10_Koh                               
  EH1503 | sce_filteredM3Drop10_Koh                            
  ...      ...                                                 
  EH1651 | clustering_summary_filteredHVG10_SimKumar4hard_v2   
  EH1652 | clustering_summary_

In [None]:
sce_full_Koh(metadata = FALSE)
sce_filteredExpr10_Koh(metadata = FALSE)
sce_filteredHVG10_Koh(metadata = FALSE)

In [None]:
sce_full_Kumar(metadata = FALSE)
sce_filteredExpr10_Kumar(metadata = FALSE)
sce_filteredHVG10_Kumar(metadata = FALSE)

In [None]:
sce_full_SimKumar4easy(metadata = FALSE)
sce_filteredExpr10_SimKumar4easy(metadata = FALSE)
sce_filteredHVG10_SimKumar4easy(metadata = FALSE)

sce_full_SimKumar4hard(metadata = FALSE)
sce_filteredExpr10_SimKumar4hard(metadata = FALSE)
sce_filteredHVG10_SimKumar4hard(metadata = FALSE)

sce_full_SimKumar8hard(metadata = FALSE)
sce_filteredExpr10_SimKumar8hard(metadata = FALSE)
sce_filteredHVG10_SimKumar8hard(metadata = FALSE)

In [None]:
sce_full_Trapnell(metadata = FALSE)
sce_filteredExpr10_Trapnell(metadata = FALSE)
sce_filteredHVG10_Trapnell(metadata = FALSE)

In [None]:
sce_full_Zhengmix4eq(metadata = FALSE)
sce_filteredExpr10_Zhengmix4eq(metadata = FALSE)
sce_filteredHVG10_Zhengmix4eq(metadata = FALSE)

sce_full_Zhengmix4uneq(metadata = FALSE)
sce_filteredExpr10_Zhengmix4uneq(metadata = FALSE)
sce_filteredHVG10_Zhengmix4uneq(metadata = FALSE)

sce_full_Zhengmix8eq(metadata = FALSE)
sce_filteredExpr10_Zhengmix8eq(metadata = FALSE)
sce_filteredHVG10_Zhengmix8eq(metadata = FALSE)

In [5]:
#clustering_summary_filteredExpr10_Koh_v2()
res <- clustering_summary_filteredExpr10_Koh_v2()

Using temporary cache /tmp/RtmpXb02iC/BiocFileCache

snapshotDate(): 2019-10-22

Using temporary cache /tmp/RtmpXb02iC/BiocFileCache

see ?DuoClustering2018 and browseVignettes('DuoClustering2018') for documentation

Using temporary cache /tmp/RtmpXb02iC/BiocFileCache

Using temporary cache /tmp/RtmpXb02iC/BiocFileCache

Using temporary cache /tmp/RtmpXb02iC/BiocFileCache

loading from cache

Using temporary cache /tmp/RtmpXb02iC/BiocFileCache

Using temporary cache /tmp/RtmpXb02iC/BiocFileCache

Using temporary cache /tmp/RtmpXb02iC/BiocFileCache



In [6]:
slotNames(res)

In [12]:
write.table(res,'temp.tsv',sep='\t')

dataset,method,cell,run,k,resolution,cluster,trueclass,est_k,elapsed
<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>
sce_filteredExpr10_Koh,PCAKmeans,SRR3952323,1,2,,1,H7hESC,,14.318
sce_filteredExpr10_Koh,PCAKmeans,SRR3952325,1,2,,1,H7hESC,,14.318
sce_filteredExpr10_Koh,PCAKmeans,SRR3952326,1,2,,1,H7hESC,,14.318
sce_filteredExpr10_Koh,PCAKmeans,SRR3952327,1,2,,1,H7hESC,,14.318
sce_filteredExpr10_Koh,PCAKmeans,SRR3952328,1,2,,1,H7hESC,,14.318
sce_filteredExpr10_Koh,PCAKmeans,SRR3952329,1,2,,1,H7hESC,,14.318
sce_filteredExpr10_Koh,PCAKmeans,SRR3952330,1,2,,1,H7hESC,,14.318
sce_filteredExpr10_Koh,PCAKmeans,SRR3952331,1,2,,1,H7hESC,,14.318
sce_filteredExpr10_Koh,PCAKmeans,SRR3952337,1,2,,1,H7hESC,,14.318
sce_filteredExpr10_Koh,PCAKmeans,SRR3952341,1,2,,1,H7hESC,,14.318


In [None]:
sce <- sce_filteredExpr10_Koh()

In [14]:
scename <- "sce_filteredExpr10_Koh"
method <- "PCAHC"

In [15]:
params_all=duo_clustering_all_parameter_settings_v2()

Using temporary cache /tmp/RtmplSzqSv/BiocFileCache

snapshotDate(): 2019-10-22

Using temporary cache /tmp/RtmplSzqSv/BiocFileCache

see ?DuoClustering2018 and browseVignettes('DuoClustering2018') for documentation

Using temporary cache /tmp/RtmplSzqSv/BiocFileCache

Using temporary cache /tmp/RtmplSzqSv/BiocFileCache

Using temporary cache /tmp/RtmplSzqSv/BiocFileCache

loading from cache

Using temporary cache /tmp/RtmplSzqSv/BiocFileCache

Using temporary cache /tmp/RtmplSzqSv/BiocFileCache

Using temporary cache /tmp/RtmplSzqSv/BiocFileCache



In [16]:
params <- params_all[[paste0(scename, "_",method)]]

In [17]:
params

In [24]:
#res

In [20]:
## Set number of times to run clustering for each k
n_rep <- 5

## Run clustering
set.seed(1234)
L <- lapply(seq_len(n_rep), function(i) {  ## For each run
  cat(paste0("run = ", i, "\n"))
  if (method == "Seurat") {
    tmp <- lapply(params$range_resolutions, function(resolution) {  
      ## For each resolution
      cat(paste0("resolution = ", resolution, "\n"))
      ## Run clustering
      res <- get(paste0("apply_", method))(sce = sce, params = params, 
                                           resolution = resolution)
      
      ## Put output in data frame
      df <- data.frame(dataset = scename, 
                       method = method, 
                       cell = names(res$cluster),
                       run = i,
                       k = length(unique(res$cluster)),
                       resolution = resolution,
                       cluster = res$cluster,
                       stringsAsFactors = FALSE, row.names = NULL)
      tm <- data.frame(dataset = scename, 
                       method = method,
                       run = i, 
                       k = length(unique(res$cluster)),
                       resolution = resolution,
                       user.self = res$st[["user.self"]],
                       sys.self = res$st[["sys.self"]],
                       user.child = res$st[["user.child"]],
                       sys.child = res$st[["sys.child"]],
                       elapsed = res$st[["elapsed"]],
                       stringsAsFactors = FALSE, row.names = NULL)
      kest <- data.frame(dataset = scename, 
                         method = method,
                         run = i, 
                         k = length(unique(res$cluster)),
                         resolution = resolution,
                         est_k = res$est_k,
                         stringsAsFactors = FALSE, row.names = NULL)
      list(clusters = df, timing = tm, kest = kest)
    })  ## End for each resolution
  } else {
    tmp <- lapply(params$range_clusters, function(k) {  ## For each k
      cat(paste0("k = ", k, "\n"))
      ## Run clustering
      res <- get(paste0("apply_", method))(sce = sce, params = params, k = k)
      
      ## Put output in data frame
      df <- data.frame(dataset = scename, 
                       method = method, 
                       cell = names(res$cluster),
                       run = i,
                       k = k,
                       resolution = NA,
                       cluster = res$cluster,
                       stringsAsFactors = FALSE, row.names = NULL)
      tm <- data.frame(dataset = scename, 
                       method = method,
                       run = i, 
                       k = k,
                       resolution = NA,
                       user.self = res$st[["user.self"]],
                       sys.self = res$st[["sys.self"]],
                       user.child = res$st[["user.child"]],
                       sys.child = res$st[["sys.child"]],
                       elapsed = res$st[["elapsed"]],
                       stringsAsFactors = FALSE, row.names = NULL)
      kest <- data.frame(dataset = scename, 
                         method = method,
                         run = i, 
                         k = k,
                         resolution = NA,
                         est_k = res$est_k,
                         stringsAsFactors = FALSE, row.names = NULL)
      list(clusters = df, timing = tm, kest = kest)
    })  ## End for each k
  }
  
  ## Summarize across different values of k
  assignments <- do.call(rbind, lapply(tmp, function(w) w$clusters))
  timings <- do.call(rbind, lapply(tmp, function(w) w$timing))
  k_estimates <- do.call(rbind, lapply(tmp, function(w) w$kest))
  list(assignments = assignments, timings = timings, k_estimates = k_estimates)
})  ## End for each run

In [20]:
## Summarize across different runs
assignments <- do.call(rbind, lapply(L, function(w) w$assignments))
timings <- do.call(rbind, lapply(L, function(w) w$timings))
k_estimates <- do.call(rbind, lapply(L, function(w) w$k_estimates))

## Add true group for each cell
truth <- data.frame(cell = as.character(rownames(colData(sce))),
                    trueclass = as.character(colData(sce)$phenoid),
                    stringsAsFactors = FALSE)
assignments$trueclass <- truth$trueclass[match(assignments$cell, truth$cell)]

## Combine results
res_all <- list(assignments = assignments, timings = timings,
            k_estimates = k_estimates)

df_all <- dplyr::full_join(res$assignments %>%
                         dplyr::select(dataset, method, cell, run, k, 
                                       resolution, cluster, trueclass),
                       res$k_estimates %>%
                         dplyr::select(dataset, method, run, k, 
                                       resolution, est_k)
) %>% dplyr::full_join(res$timings %>% dplyr::select(dataset, method, run, k,
                                                     resolution, elapsed))

run = 1
k = 2


Loading required package: SingleCellExperiment

Loading required package: SummarizedExperiment

Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: BiocGenerics

Loading required package: parallel


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tap

k = 3
k = 4
k = 5
k = 6
k = 7
k = 8
k = 9
k = 10
k = 11
k = 12
k = 13
k = 14
k = 15
run = 2
k = 2
k = 3
k = 4
k = 5
k = 6
k = 7
k = 8
k = 9
k = 10
k = 11
k = 12
k = 13
k = 14
k = 15
run = 3
k = 2
k = 3
k = 4
k = 5
k = 6
k = 7
k = 8
k = 9
k = 10
k = 11
k = 12
k = 13
k = 14
k = 15
run = 4
k = 2
k = 3
k = 4
k = 5
k = 6
k = 7
k = 8
k = 9
k = 10
k = 11
k = 12
k = 13
k = 14
k = 15
run = 5
k = 2
k = 3
k = 4
k = 5
k = 6
k = 7
k = 8
k = 9
k = 10
k = 11
k = 12
k = 13
k = 14
k = 15


ERROR: Error in dplyr::full_join(res$assignments %>% dplyr::select(dataset, method, : 함수 "%>%"를 찾을 수 없습니다


In [21]:
res

dataset,method,cell,run,k,resolution,cluster,trueclass
<chr>,<chr>,<chr>,<int>,<dbl>,<lgl>,<int>,<chr>
sce_filteredExpr10_Koh,PCAHC,SRR3952323,1,2,,,H7hESC
sce_filteredExpr10_Koh,PCAHC,SRR3952325,1,2,,,H7hESC
sce_filteredExpr10_Koh,PCAHC,SRR3952326,1,2,,,H7hESC
sce_filteredExpr10_Koh,PCAHC,SRR3952327,1,2,,,H7hESC
sce_filteredExpr10_Koh,PCAHC,SRR3952328,1,2,,,H7hESC
sce_filteredExpr10_Koh,PCAHC,SRR3952329,1,2,,,H7hESC
sce_filteredExpr10_Koh,PCAHC,SRR3952330,1,2,,,H7hESC
sce_filteredExpr10_Koh,PCAHC,SRR3952331,1,2,,,H7hESC
sce_filteredExpr10_Koh,PCAHC,SRR3952337,1,2,,,H7hESC
sce_filteredExpr10_Koh,PCAHC,SRR3952341,1,2,,,H7hESC

dataset,method,run,k,resolution,user.self,sys.self,user.child,sys.child,elapsed
<chr>,<chr>,<int>,<dbl>,<lgl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
sce_filteredExpr10_Koh,PCAHC,1,2,,,,,,
sce_filteredExpr10_Koh,PCAHC,1,3,,25.187,21.264,0,0,1.528
sce_filteredExpr10_Koh,PCAHC,1,4,,24.521,19.120,0,0,1.059
sce_filteredExpr10_Koh,PCAHC,1,5,,24.429,19.486,0,0,1.063
sce_filteredExpr10_Koh,PCAHC,1,6,,25.128,20.127,0,0,1.091
sce_filteredExpr10_Koh,PCAHC,1,7,,25.350,20.794,0,0,1.109
sce_filteredExpr10_Koh,PCAHC,1,8,,24.388,18.916,0,0,1.054
sce_filteredExpr10_Koh,PCAHC,1,9,,24.387,19.825,0,0,1.072
sce_filteredExpr10_Koh,PCAHC,1,10,,25.845,21.182,0,0,1.129
sce_filteredExpr10_Koh,PCAHC,1,11,,24.438,19.299,0,0,1.062

dataset,method,run,k,resolution,est_k
<chr>,<chr>,<int>,<dbl>,<lgl>,<lgl>
sce_filteredExpr10_Koh,PCAHC,1,2,,
sce_filteredExpr10_Koh,PCAHC,1,3,,
sce_filteredExpr10_Koh,PCAHC,1,4,,
sce_filteredExpr10_Koh,PCAHC,1,5,,
sce_filteredExpr10_Koh,PCAHC,1,6,,
sce_filteredExpr10_Koh,PCAHC,1,7,,
sce_filteredExpr10_Koh,PCAHC,1,8,,
sce_filteredExpr10_Koh,PCAHC,1,9,,
sce_filteredExpr10_Koh,PCAHC,1,10,,
sce_filteredExpr10_Koh,PCAHC,1,11,,
