# Install Libraries
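Seurat installation instructions can be found in the [Seurat install guide](https://satijalab.org/seurat/articles/install.html).\
This notebook follows the Seurat [mapping and annotating query datasets vignette](https://satijalab.org/seurat/articles/integration_mapping.html).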

In [1]:
# ---- Seurat parameters ----
# Paths to the serialized Seurat objects
reference_data <- "path-to-seurat-object" # reference dataset
query_data <- "path-to-seurat-object" # query dataset

# Genome build: either hg38 or mm10
genome <- "genome-name"

# Normalization settings
normalization_method <- "LogNormalize"
normalization_scale_factor <- 10000

# Variable feature selection settings
variable_features_method <- "vst"
variable_features_num <- 2000

# Name of the dimensional reduction used for weighting anchors
weight.reduction <- "pca"
# Number of leading dimensions to use in the anchor weighting procedure
n_dims <- 30

# Number of workers handed to the future plan
threads <- 8
# Project name; used as prefix of every output file
prefix <- "prefix"

# ---- Papermill specific parameters ----
papermill <- TRUE

In [2]:
# #########################
# # For test
# reference_data = "../../../ReferenceData/BrainAgingSpatialAtlas_snRNAseq.rds"
# query_data = "../../../QueryData/MouseBrain/SS-PKR-129-192-PLATE1-LEFT-HALF.rna.seurat.filtered_rds.mm10.rds"
# genome = "mm10"

In [3]:
# Coerce the papermill flag to a logical so it can be used directly in if().
# NOTE(review): presumably needed because externally injected parameters may
# arrive as strings ("TRUE"/"FALSE") - confirm against the pipeline caller.
papermill <- as.logical(papermill)

In [4]:
suppressMessages(library(Seurat))
suppressMessages(library(future))
suppressMessages(library(logr))
suppressMessages(library(dplyr))
suppressMessages(library(grid))
suppressMessages(library(gridExtra))
suppressMessages(library(ggplot2))
suppressMessages(library(patchwork))
suppressMessages(library(cowplot))
suppressMessages(library(EnsDb.Mmusculus.v79))
suppressMessages(library(EnsDb.Hsapiens.v86))

In [None]:
# NOTE(review): this only creates a variable called `future.seed`; it looks
# like it was meant as a future option/argument - confirm the intent.
future.seed <- TRUE

# Run future-based computations in `threads` background R sessions.
plan("multisession", workers = threads)

# Silence logr notes and raise the size limit for globals exported to workers.
options(
    logr.notes = FALSE,
    future.globals.maxSize = 10e9
)

# Fixed RNG seed
set.seed(1234)

In [5]:
#' Rebuild a Seurat object so that features are named by gene symbol
#'
#' Looks up the gene symbol of every feature (row name) of `object` in the
#' Ensembl annotation matching `genome`, drops features without a symbol,
#' makes the remaining symbols unique and creates a fresh Seurat object from
#' the renamed RNA count matrix. The original cell metadata is carried over.
#'
#' @param object Seurat object whose row names are Ensembl gene IDs and whose
#'   RNA assay holds a `counts` matrix.
#' @param genome Genome build: "hg38"/"hg37" use EnsDb.Hsapiens.v86,
#'   "mm10"/"mm9" use EnsDb.Mmusculus.v79.
#' @return A new Seurat object containing only the genes that could be mapped,
#'   with unique gene symbols as feature names.
#' Raises an error for an unsupported `genome` or when no gene ID can be mapped.
create_seurat_obj_with_gene_symbol <- function(object, genome){

    # Choose the annotation database; fail early with a clear message instead
    # of an "object 'gene.id' not found" error further down.
    if (genome %in% c("hg38", "hg37")) {
        ensdb <- EnsDb.Hsapiens.v86
    } else if (genome %in% c("mm10", "mm9")) {
        ensdb <- EnsDb.Mmusculus.v79
    } else {
        stop("Unsupported genome '", genome,
             "': expected one of hg38, hg37, mm10, mm9", call. = FALSE)
    }

    # get gene symbol
    gene.id <- ensembldb::select(ensdb,
                                 keys = rownames(object),
                                 keytype = "GENEID",
                                 columns = c("SYMBOL", "GENEID"))

    # remove genes with empty symbol
    gene.id <- subset(gene.id, gene.id$SYMBOL != "")

    if (nrow(gene.id) == 0) {
        stop("None of the feature names could be mapped to a gene symbol; ",
             "are the row names Ensembl gene IDs for genome '", genome, "'?",
             call. = FALSE)
    }

    # make gene symbol unique (duplicates get a numeric suffix, no separator)
    gene.id$Unique_SYMBOL <- make.unique(gene.id$SYMBOL, "")

    # NOTE(review): direct slot access assumes a Seurat v3/v4-style Assay with
    # a `counts` slot - confirm before running under Seurat v5.
    counts <- object@assays$RNA@counts
    # drop = FALSE keeps a matrix even if only a single gene was mapped
    counts <- counts[gene.id$GENEID, , drop = FALSE]
    rownames(counts) <- gene.id$Unique_SYMBOL

    CreateSeuratObject(counts = counts, meta.data = object@meta.data)
}

In [6]:
#Plot output directory (used by the plot-saving helper printPNG); created if missing
plot_filename <- glue::glue("{prefix}.rna.cell.annotation.plots.{genome}")
dir.create(plot_filename, showWarnings = FALSE)

# Save a plot as a PNG file inside the plot directory.
#   name          - short tag inserted into the output file name
#   plot          - plot object handed to ggsave()
#   papermill     - the file is written only when this is TRUE
#   width, height - forwarded to ggsave()
# Relies on the globals plot_filename, prefix and genome for the file path.
printPNG <- function(name, plot, papermill, width = 22, height = 11){
    out_path <- glue::glue("{plot_filename}/{prefix}.rna.cell.annotation.{name}.{genome}.png")

    # Nothing is saved (or shown) outside of papermill runs
    if(!papermill){
        return(invisible(NULL))
    }

    ggsave(
        filename = out_path,
        plot = plot,
        width = width,
        height = height
    )
}

#Open the run log; the log_print() calls in later cells write to it
logfile <- file.path(
    glue::glue("{prefix}.rna.cell.annotation.logfile.{genome}.txt")
)
lf <- log_open(logfile)

In [7]:
# Load the reference object from the RDS file given by `reference_data`
tryCatch(
    expr = {
        log_print("# Reading reference data...")
        obj.ref <- readRDS(file = reference_data)
        log_print("SUCCESSFUL: Reading reference data")
    },
    error = function(err) {
        # Failure is written to the log together with the condition;
        # the error is not re-raised
        log_print("ERROR: Reading reference data")
        log_print(err)
    }
)

[1] "# Reading reference data..."
[1] "SUCCESSFUL: Reading reference data"


In [8]:
# Load the query object from the RDS file given by `query_data`
tryCatch(
    expr = {
        log_print("# Reading query data...")
        obj.query <- readRDS(file = query_data)
        log_print("SUCCESSFUL: Reading query data")
    },
    error = function(err) {
        # Failure is written to the log together with the condition;
        # the error is not re-raised
        log_print("ERROR: Reading query data")
        log_print(err)
    }
)

[1] "# Reading query data..."
[1] "SUCCESSFUL: Reading query data"


In [9]:
# Rename the reference features from gene IDs to gene symbols
tryCatch(
    expr = {
        log_print("# Converting gene id to symbol for reference data")
        obj.ref <- create_seurat_obj_with_gene_symbol(
            object = obj.ref,
            genome = genome
        )
        log_print("SUCCESSFUL: Converting gene id to symbol for reference data")
    },
    error = function(err) {
        # Failure is written to the log together with the condition;
        # the error is not re-raised
        log_print("ERROR: Converting gene id to symbol for reference data")
        log_print(err)
    }
)

[1] "# Converting gene id to symbol for reference data"
[1] "SUCCESSFUL: Converting gene id to symbol for reference data"


In [10]:
# Restrict reference and query data to the genes present in both objects
tryCatch(
    {
        log_print("# Subseting reference and query data with common genes")

        gene.common <- intersect(rownames(obj.ref), rownames(obj.query))

        # Log the count before subsetting so it is recorded even if subset() fails
        log_print(glue::glue("# Found {length(gene.common)} common genes between reference and query data"))

        # An empty intersection usually means the two objects name their genes
        # differently; stop with an explicit message (logged by the handler below)
        if (length(gene.common) == 0) {
            stop("No common genes between reference and query data; ",
                 "check that both objects use the same gene identifiers",
                 call. = FALSE)
        }

        obj.ref <- subset(obj.ref, features = gene.common)
        obj.query <- subset(obj.query, features = gene.common)

        log_print("SUCCESSFUL: Subseting reference data")
    },
    error = function(cond) {
        # Failure is written to the log together with the condition
        log_print("ERROR: Subseting reference data")
        log_print(cond)
    }

)

[1] "# Subseting reference and query data with common genes"


"Not all features provided are in this Assay object, removing the following feature(s): Tafa1, Gm32338, BC005561, Nrg1, Gm32647, Gm42418, Gm10754, Tafa2, Gm10419, Gm26871, mt-Co1, 2010300C02Rik, Gmnc, 2700081O15Rik, 4921539H07Rik, Gm33228, Gm20754, Adgrl4, Rflnb, Lhfpl3, March1, Tmem94, Gm10649, mt-Nd1, Gm5127, Epb41l4a, Drd1, Gm28905, Mrm2, Pakap.1, Gm5820, Rmst, C78859, Gm14051, Gm10516, C230072F16Rik, Gm30371, C130073E24Rik, Gm39185, Gm44593, Twnk, Rtl4, Gm867, Gm38413, Gm12367, Gm49969, Gm45356, Gm6260, BC051408, Gm20457, Gm31645, Gm13561, Ints11, Gm14412, Gm11906, Rps6ka2, Dele1, Gm49678, Tmem131l, Gm20125, Gm45459, Gm49164, Sdhaf4, Gm11542, Adgrl2, Gm1604a, Usf3, Ccn2, Gm42196, Gm15587, Gm4890, Phf24, Gm43598, Gm42851, P3h1, Gm30382, Gm27188, Gm26673, Mrvi1, Vxn, Gsdme, Gm33677, Cip2a, Gm20387, 4930547E14Rik, Znrd2, C530008M17Rik, Gm2164, Gm14033, Gm39043, Plpp3, Cramp1l, Gm12689, Gm26621, Gm38560, 4930488L21Rik, Fam71d, 5330417C22Rik, 4933406B17Rik, Gm19325, 9530018F02Rik, Gm478

# Found 15561 common genes between reference and query data
[1] "SUCCESSFUL: Subseting reference data"


In [11]:
# Predict labels for query dataset:
#   1. normalize both objects and select variable features,
#   2. find transfer anchors between reference and query (CCA),
#   3. transfer the reference `cell_type` labels and store the predictions
#      in the query metadata.
tryCatch(
    {
        log_print("# Predicting labels for query data")

        # The normalization parameters declared at the top of the notebook are
        # passed explicitly so that changing them actually takes effect.
        obj.ref <- obj.ref %>%
            NormalizeData(normalization.method = normalization_method,
                          scale.factor = normalization_scale_factor,
                          verbose = FALSE) %>%
            FindVariableFeatures(selection.method = variable_features_method,
                                 nfeatures = variable_features_num)

        obj.query <- obj.query %>%
            NormalizeData(normalization.method = normalization_method,
                          scale.factor = normalization_scale_factor,
                          verbose = FALSE) %>%
            FindVariableFeatures(selection.method = variable_features_method,
                                 nfeatures = variable_features_num)

        transfer.anchors <- FindTransferAnchors(
            reference = obj.ref,
            query = obj.query,
            reduction = "cca",
            verbose = TRUE
        )

        # The query's own reduction (named by `weight.reduction`) is used for
        # anchor weighting. A NULL n_dims is forwarded as NULL, which
        # TransferData documents as "use the dimensions used to find anchors";
        # 1:NULL would raise an error instead.
        predictions <- TransferData(
            anchorset = transfer.anchors,
            refdata = obj.ref$cell_type,
            weight.reduction = obj.query[[weight.reduction]],
            dims = if (is.null(n_dims)) NULL else seq_len(n_dims),
            verbose = TRUE
        )

        obj.query <- AddMetaData(obj.query, metadata = predictions)

        log_print("SUCCESSFUL: Predicting labels for query data")
    },
    error = function(cond) {
        # Failure is written to the log together with the condition
        log_print("ERROR: Predicting labels for query data")
        log_print(cond)
    }

)

[1] "# Predicting labels for query data"


Running CCA

Merging objects

Finding neighborhoods

Finding anchors

	Found 7752 anchors

Filtering anchors

	Retained 3178 anchors

Finding integration vectors

Finding integration vector weights

Predicting cell labels



[1] "SUCCESSFUL: Predicting labels for query data"


In [43]:
# Preview the first rows of the label-transfer result
# (predicted.id plus the per-class prediction.score.* columns)
head(predictions)

Unnamed: 0_level_0,predicted.id,prediction.score.neuron,prediction.score.medium.spiny.neuron,prediction.score.inhibitory.interneuron,prediction.score.oligodendrocyte,prediction.score.astrocyte,prediction.score.microglial.cell,prediction.score.oligodendrocyte.precursor.cell,prediction.score.vascular.leptomeningeal.cell,prediction.score.cell,prediction.score.T.cell,prediction.score.endothelial.cell,prediction.score.pericyte,prediction.score.macrophage,prediction.score.max
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
"ACAGTGGT,AACCTATA,GCAACATT,SS-PKR-129-192-PLATE1-LEFT-HALF",neuron,0.56564437,0.32868598,0.07610938,0.029560269,0.0,0,0,0.0,0.0,0,0.0,0.0,0,0.5656444
"ACAGTGGT,AAGGTTAA,CAGATCTG,SS-PKR-129-192-PLATE1-LEFT-HALF",neuron,0.45674741,0.23478539,0.3084672,0.0,0.0,0,0,0.0,0.0,0,0.0,0.0,0,0.4567474
"ACAGTGGT,AAGGTTAA,TCTGCTGT,SS-PKR-129-192-PLATE1-LEFT-HALF",endothelial cell,0.09658323,0.0,0.20521058,0.0,0.04141287,0,0,0.2239648,0.0,0,0.2642176,0.1686109,0,0.2642176
"ACAGTGGT,AATCTCGC,CCAATTCC,SS-PKR-129-192-PLATE1-LEFT-HALF",astrocyte,0.08183617,0.0,0.01318612,0.223675201,0.56116519,0,0,0.0,0.12013731,0,0.0,0.0,0,0.5611652
"ACAGTGGT,AATTGAGT,TAGCTTGT,SS-PKR-129-192-PLATE1-LEFT-HALF",astrocyte,0.24459504,0.01442037,0.03508238,0.006232344,0.69966986,0,0,0.0,0.0,0,0.0,0.0,0,0.6996699
"ACAGTGGT,ACAGTGGT,CATTCTGA,SS-PKR-129-192-PLATE1-LEFT-HALF",astrocyte,0.0,0.0,0.01972702,0.057902093,0.90854163,0,0,0.0,0.01382926,0,0.0,0.0,0,0.9085416


In [15]:
## Side-by-side embedding plots: Seurat clusters vs. predicted labels
tryCatch(
    expr = {
        log_print("# Plotting predicted labels")

        # Left panel: cells grouped by cluster
        p1 <- DimPlot(
            obj.query,
            group.by = "seurat_clusters",
            label = TRUE,
            label.size = 5,
            repel = TRUE
        )

        # Right panel: cells grouped by transferred label
        p2 <- DimPlot(
            obj.query,
            group.by = "predicted.id",
            label = TRUE,
            label.size = 5,
            repel = TRUE
        )

        p <- p1 + p2

        printPNG(
            name = "predicted.labels",
            plot = p,
            papermill = papermill,
            width = 15,
            height = 6
        )

        log_print("SUCCESSFUL: Plotting predicted labels")
    },
    error = function(err) {
        # Failure is written to the log together with the condition;
        # the error is not re-raised
        log_print("ERROR: Plotting predicted labels")
        log_print(err)
    }
)

[1] "# Plotting predicted labels"
[1] "SUCCESSFUL: Plotting predicted labels"


In [42]:
## Violin plots of the per-cell-type prediction scores, one facet per cluster
tryCatch(
    {
        log_print("# Plotting predicted score per cluster")

        # Number of facet columns; also drives the figure size below
        n_facet_cols <- 4

        # Keep the cluster column and the per-class score columns. The summary
        # column "prediction.score.max" is excluded by name instead of relying
        # on it being the last column (the former `1:length(x) - 1` indexing
        # only worked through operator precedence and column order).
        score_cols <- grep("^prediction\\.score\\.",
                           colnames(obj.query@meta.data), value = TRUE)
        score_cols <- setdiff(score_cols, "prediction.score.max")
        sel_cols <- c("seurat_clusters", score_cols)

        # Long format: one row per (cell, cell type) with its score
        df <- obj.query@meta.data %>%
            subset(select = sel_cols) %>%
            tidyr::pivot_longer(cols = -seurat_clusters,
                                names_to = "celltype",
                                values_to = "score")

        # Strip the literal column prefix (anchored, dots escaped)
        df$celltype <- sub("^prediction\\.score\\.", "", df$celltype)

        p <- ggplot(df, aes(x = celltype, y = score)) +
             geom_violin(aes(fill = celltype), scale = "width") +
             facet_wrap(~seurat_clusters, ncol = n_facet_cols) +
             theme_cowplot() +
             xlab("") + ylab("Predicted score") +
             theme(axis.text.x = element_text(angle = 60, hjust = 1),
                   legend.position = "none",
                   plot.title = element_text(hjust = 0.5))

        # decide figure size: 3 units per facet, plus room for the rotated labels
        n_clusters <- length(unique(df$seurat_clusters))
        n_rows <- ceiling(n_clusters / n_facet_cols)

        printPNG(name = "predicted.scores", plot = p, papermill = papermill,
                 width = 3 * n_facet_cols, height = 3 * n_rows + 2)

        log_print("SUCCESSFUL: Plotting predicted score per cluster")
    },
    error = function(cond) {
        # Failure is written to the log together with the condition
        log_print("ERROR: Plotting predicted score per cluster")
        log_print(cond)
    }

)

[1] "# Plotting predicted score per cluster"
[1] "SUCCESSFUL: Plotting predicted score per cluster"
