# Batch correction and integrated analysis of ependymoma samples
Regrettably, it seems that Gojo et al. aggregated all of their sequencing data by sample, so it is no longer possible to
batch correct on sequencing type (scSmart-seq2, snSmart-seq2, 10X Genomics) without reanalyzing from scratch.

Current best practice advises against analysis at the feature (gene) level after batch integration.


In [None]:
# Load all required libraries
Sys.setenv(LANGUAGE = "en") # set language to "ja" if you prefer
library(Seurat)
library(tidyverse)
library(harmony)
library(ggplot2)
library(future)
library(ggalluvial)
library(mclust)

In [None]:
# Set parallel execution settings
# Use half of the available cores, but never fewer than one worker: on a
# single-core machine availableCores() / 2 truncates to 0, which is not a
# usable worker count.
future::plan(
    "multisession",
    workers = max(1L, as.integer(availableCores() / 2)),
    gc = TRUE
)
options(future.globals.maxSize = 8 * 1024^3) # Set max variable size to 8 GiB

In [None]:
# Load data
# Change DATA_DIR if you put your data somewhere other than
# ./data/external/gojo_et_al/
DATA_DIR <- file.path('data', 'external', 'gojo_et_al')

# Build a Seurat object from one counts/metadata file pair.
#   counts_file:   file name of the counts table, relative to `path`
#   metadata_file: file name of the metadata table, relative to `path`
#   path:          directory holding both files; defaults to DATA_DIR.
#                  (Previously this argument was accepted but ignored and the
#                  global DATA_DIR was always used.)
# Returns the Seurat object created from the counts with the metadata attached.
import_gojo_data <- function(counts_file, metadata_file, path = DATA_DIR) {
    counts_path <- file.path(path, counts_file)
    metadata_path <- file.path(path, metadata_file)
    counts <- read.table(counts_path)
    meta <- read.table(metadata_path)
    # Wrap the call directly so warnings raised while the object is being
    # created are suppressed regardless of how the pipe evaluates its input.
    obj <- suppressWarnings(CreateSeuratObject(counts = counts, meta.data = meta))
    return(obj)
}
clean_pdx <- function(pdx){
    # Clean the metadata of the PDX Seurat object and return the cleaned object.
    #   pdx: Seurat object whose metadata contains columns named V1 and V2.
    # Column V1 is a duplicate of the row indices and is deleted; the contents
    # of column V2 end up in a column named 'sample'.
    #
    # Make the V2 values available under the name 'sample' by writing the
    # renamed metadata back through `[[<-`.
    # NOTE(review): whether this renames V2 in place or adds a 'sample' column
    # next to it depends on Seurat's `[[<-` method -- confirm. Either way any
    # remaining V2 column is removed on the next line.
    names(pdx[[]])[names(pdx[[]]) == 'V2'] <- 'sample'
    # Remove V1 and V2 from the metadata.
    pdx@meta.data[c('V1','V2')] <- NULL
    # Copy the sample name into 'annotation' (presumably because the PDX
    # metadata has no annotation column of its own -- verify against the data).
    pdx$annotation <- pdx$sample
    return(pdx)
}

In [None]:
# Reading five large count tables from disk -- expect this cell to run for a long time.
pf <- import_gojo_data('PF_EPN_counts_200519lj.txt', 'PF_EPN_metadata_200519lj.txt', DATA_DIR)
sp <- import_gojo_data('SP_EPN_counts_200519lj.txt', 'SP_EPN_metadata_200519lj.txt', DATA_DIR)
st <- import_gojo_data('ST_EPN_counts_200519lj.txt', 'ST_EPN_metadata_200519lj.txt', DATA_DIR)
# The PDX metadata needs its columns tidied after import
pdx <- clean_pdx(import_gojo_data('PDX_counts.txt', 'PDX_metadata.txt', DATA_DIR))
pairs <- import_gojo_data('Matched_pair_counts_200519lj.txt', 'Matched_pair_metadata_200519lj.txt', DATA_DIR)

# Show a summary of each object
pf
sp
st
pdx
pairs

In [None]:
# Combine the five objects into a single dataset and join their layers
gojo <- JoinLayers(merge(pf, y = c(sp, st, pdx, pairs)))
gojo
# Helper returning the distinct values of the 'sample' metadata column
get_sample_names <- function(seuratobject){
    unique(seuratobject@meta.data$sample)
}
get_sample_names(gojo)

In [None]:
# QC thresholds: cells must fall strictly between the lower and upper bounds
ncount_lower_threshold <- 1000
ncount_upper_threshold <- 1000000
nfeature_lower_threshold <- 1000
nfeature_upper_threshold <- 10000

# Plot thresholds (red lines) on top of the per-sample distributions
options(repr.plot.width = 16, repr.plot.height = 6)
VlnPlot(gojo, features = "nCount_RNA", group.by = 'sample', log = TRUE) +
    geom_hline(yintercept = ncount_lower_threshold, color = 'red') +
    geom_hline(yintercept = ncount_upper_threshold, color = 'red')
VlnPlot(gojo, features = "nFeature_RNA", group.by = 'sample') +
    geom_hline(yintercept = nfeature_lower_threshold, color = 'red') +
    geom_hline(yintercept = nfeature_upper_threshold, color = 'red')

# Apply the QC filter
gojo <- subset(
    gojo,
    nFeature_RNA > nfeature_lower_threshold &
        nFeature_RNA < nfeature_upper_threshold &
        nCount_RNA > ncount_lower_threshold &
        nCount_RNA < ncount_upper_threshold
)
gojo
# 161 cells filtered

In [None]:
# SCTransform is meant to be run per sample, so the RNA assay is first split
# into one layer per sample. Background:
# https://github.com/satijalab/seurat/issues/5306
# https://satijalab.org/seurat/archive/v4.3/sctransform_v2_vignette
gojo[["RNA"]] <- split(x = gojo[["RNA"]], f = gojo$sample)
DefaultAssay(object = gojo) <- "RNA"
gojo

In [None]:
# Normalize with SCTransform (v2 flavor).
# Background: https://satijalab.org/seurat/articles/sctransform_vignette.html
# Slow step: more than an hour on 12 cores.
gojo <- SCTransform(
    gojo,
    vst.flavor = "v2",
    verbose = TRUE
)

In [None]:
# Checkpoint: persist the normalized object as .rds so the slow SCTransform
# step does not have to be repeated
rds <- file.path('data', 'gojo_sctransformed_seuratobj.rds')
saveRDS(object = gojo, file = rds)

In [None]:
# Checkpoint: restore the saved object here instead of waiting an hour for
# SCTransform to finish.
rds <- file.path('data', 'gojo_sctransformed_seuratobj.rds')
gojo <- readRDS(rds)
gojo

In [None]:
# Baseline without batch correction: clustering and plotting straight after
# normalization largely gives a soup that segregates neither by sample nor by
# annotation.
gojo <- RunPCA(
    gojo,
    reduction.name = "normalized.pca",
    verbose = FALSE
)
gojo <- RunUMAP(
    gojo,
    reduction = "normalized.pca",
    dims = 1:30,
    reduction.name = "normalized.umap",
    verbose = FALSE
)
gojo <- FindNeighbors(
    gojo,
    reduction = "normalized.pca",
    dims = 1:30,
    graph.name = "normalized.snn",
    verbose = FALSE
)
gojo <- FindClusters(
    gojo,
    graph.name = "normalized.snn",
    cluster.name = "normalized.clusters",
    verbose = FALSE
)


In [None]:
# Adjusted Rand index of the uncorrected clusters against sample and annotation
print(c(
    'Cluster similarity to sample IDs: ',
    mclust::adjustedRandIndex(gojo[[]]$sample, gojo[[]]$normalized.clusters)
))
print(c(
    'Cluster similarity to cell types: ',
    mclust::adjustedRandIndex(gojo[[]]$annotation, gojo[[]]$normalized.clusters)
))

# UMAP before batch correction, colored by annotation
options(repr.plot.width = 16, repr.plot.height = 8)
DimPlot(
    gojo,
    reduction = "normalized.umap",
    group.by = "annotation",
    label = TRUE,
    label.size = 6,
    repel = TRUE
)

In [None]:
# Batch correction with Harmony, followed by UMAP and clustering on the
# corrected embedding
# TODO: wrap in function
gojo <- IntegrateLayers(
    object = gojo,
    method = HarmonyIntegration,
    orig.reduction = "normalized.pca",
    new.reduction = "harmony",
    normalization.method = "SCT"
)
gojo <- RunUMAP(
    gojo,
    reduction = "harmony",
    dims = 1:30,
    reduction.name = "harmony.umap",
    verbose = FALSE
)
gojo <- FindNeighbors(
    gojo,
    reduction = "harmony",
    dims = 1:30,
    graph.name = "harmony.snn",
    verbose = FALSE
)
gojo <- FindClusters(
    gojo,
    graph.name = "harmony.snn",
    cluster.name = "harmony.clusters",
    verbose = FALSE
)

# Adjusted Rand index of the Harmony clusters against sample and annotation
print(c(
    'Cluster similarity to sample IDs: ',
    mclust::adjustedRandIndex(gojo[[]]$sample, gojo[[]]$harmony.clusters)
))
print(c(
    'Cluster similarity to cell types: ',
    mclust::adjustedRandIndex(gojo[[]]$annotation, gojo[[]]$harmony.clusters)
))



In [None]:
# Harmony UMAP, colored first by annotation and then by Harmony cluster
options(repr.plot.width = 16, repr.plot.height = 8)
DimPlot(
    gojo,
    reduction = "harmony.umap",
    group.by = "annotation",
    label = TRUE,
    label.size = 6,
    repel = TRUE
)
DimPlot(
    gojo,
    reduction = "harmony.umap",
    group.by = "harmony.clusters",
    label = TRUE,
    label.size = 6,
    repel = TRUE
)

In [None]:
options(repr.plot.width = 16, repr.plot.height = 8)

# L1CAM expression per annotated cell type
VlnPlot(gojo, features = 'L1CAM', group.by = 'annotation')
# Observation: L1CAM is restricted to PF-Neuronal-Precursor-like and ST cell types.

# L1CAM expression per sample
VlnPlot(gojo, features = 'L1CAM', group.by = 'sample')
# Observations:
# BT1030 and CPDM0785 do not resemble RELA ependymomas in L1CAM expression.
# MUV006 (ST-YAP1) does not express L1CAM.

In [None]:
# Marker genes grouped by the cell population they are used to identify
microglia_markers <- c('CD14', 'FCER1G', 'CSF1R')
tcell_markers <- c('CD3E', 'CD4', 'CD8A')
opc_markers <- c('OLIG1', 'APOD', 'PDGFRA')
oligodendrocyte_markers <- c('MBP', 'PLP1', 'MOG')
other_markers <- c('L1CAM')
# Concatenate in display order
markers <- c(
    microglia_markers,
    tcell_markers,
    opc_markers,
    oligodendrocyte_markers,
    other_markers
)
DotPlot(gojo, group.by = "annotation", features = markers) #+ RotatedAxis()

In [None]:
# TODO: genes correlated with L1CAM expression
# L1CAM expression overlaid on the Harmony UMAP
FeaturePlot(
    gojo,
    features = "L1CAM",
    reduction = 'harmony.umap',
    pt.size = 2
)

# What cell types express L1CAM?
Markers: GLI2, CD276

In [None]:
# Plot the object with DimPlot's default settings (no reduction or grouping given).
# NOTE(review): which reduction and grouping Seurat picks by default is not
# visible in this file -- confirm this shows the intended embedding.
DimPlot(gojo)

# Less useful plots

In [None]:
# Zoom in on the ST ependymoma samples only: keep cells from the listed
# samples whose annotation is one of the ST cell types (or missing)
options(repr.plot.width = 24, repr.plot.height = 8)
gojo_subset <- subset(
    gojo,
    sample %in% c(
        'MUV043', 'MUV043Nuc1', 'MUV043Nuc2', 'MUV056', 'Peds4', 'BT165PDX'
    ) &
        annotation %in% c(
            'ST-Ependymal-like', 'ST-G2M-Phase', 'ST-Interferon-Response',
            'ST-Metabolic', 'ST-Midline', 'ST-Neuronal-Precursor-like',
            'ST-RELA-Variable', 'ST-Radial-Glia-like', 'ST-S-Phase',
            'ST-YAP1', NA
        )
)
VlnPlot(gojo_subset, features = 'L1CAM', group.by = 'sample', split.by = 'annotation')
# Observation: no obvious differences by cell type in within-sample variation of L1CAM.

In [None]:
# Alluvial diagram linking sample -> Harmony cluster -> annotation.
# Too complicated to be really useful.
# Count the cells in every (sample, cluster, annotation) combination
frequency_table <- summarise(
    group_by(gojo[[]], sample, harmony.clusters, annotation),
    count = n(),
    .groups = 'drop'
)

ggplot(
    frequency_table,
    aes(
        axis1 = sample,
        axis2 = harmony.clusters,
        axis3 = annotation,
        y = count
    )
) +
    geom_alluvium(aes(fill = annotation)) +
    geom_stratum() +
    geom_text(stat = "stratum", aes(label = after_stat(stratum)))

# Trying alternative batch corrections
See https://satijalab.org/seurat/articles/integration_introduction#perform-integration-with-sctransform-normalized-datasets

In [None]:

# Alternative batch correction: CCA integration, then UMAP and clustering on
# the corrected embedding
gojo <- IntegrateLayers(
    object = gojo,
    method = CCAIntegration,
    orig.reduction = "normalized.pca",
    new.reduction = "cca",
    normalization.method = "SCT",
    k.weight = 50
)
gojo <- RunUMAP(
    gojo,
    reduction = "cca",
    dims = 1:30,
    reduction.name = "cca.umap",
    verbose = FALSE
)
gojo <- FindNeighbors(
    gojo,
    reduction = "cca",
    dims = 1:30,
    graph.name = "cca.snn",
    verbose = FALSE
)
gojo <- FindClusters(
    gojo,
    graph.name = "cca.snn",
    cluster.name = "cca.clusters",
    verbose = FALSE
)

# Adjusted Rand index of the CCA clusters against sample and annotation
print(c(
    'Cluster similarity to sample IDs: ',
    mclust::adjustedRandIndex(gojo[[]]$sample, gojo[[]]$cca.clusters)
))
print(c(
    'Cluster similarity to cell types: ',
    mclust::adjustedRandIndex(gojo[[]]$annotation, gojo[[]]$cca.clusters)
))
# Observation: CCA seems to be able to distinguish OPCs and cycling cells, but
# has trouble distinguishing clusters. Probably not useful.

In [None]:
# CCA UMAP, colored by annotation, CCA cluster and sample in turn
options(repr.plot.width = 16, repr.plot.height = 8)
DimPlot(
    gojo,
    reduction = "cca.umap",
    group.by = "annotation",
    label = TRUE,
    label.size = 6,
    repel = TRUE
)
DimPlot(
    gojo,
    reduction = "cca.umap",
    group.by = "cca.clusters",
    label = TRUE,
    label.size = 6,
    repel = TRUE
)
DimPlot(
    gojo,
    reduction = "cca.umap",
    group.by = "sample",
    label = TRUE,
    label.size = 6,
    repel = TRUE
)

In [None]:
# Alternative batch correction: RPCA integration, then UMAP and clustering on
# the corrected embedding
gojo <- IntegrateLayers(
    object = gojo,
    method = RPCAIntegration,
    orig.reduction = "normalized.pca",
    new.reduction = "rpca",
    normalization.method = "SCT"
)
gojo <- RunUMAP(
    gojo,
    reduction = "rpca",
    dims = 1:30,
    reduction.name = "rpca.umap",
    verbose = FALSE
)
gojo <- FindNeighbors(
    gojo,
    reduction = "rpca",
    dims = 1:30,
    graph.name = "rpca.snn",
    verbose = FALSE
)
gojo <- FindClusters(
    gojo,
    graph.name = "rpca.snn",
    cluster.name = "rpca.clusters",
    verbose = FALSE
)

# Adjusted Rand index of the RPCA clusters against sample and annotation
print(c(
    'Cluster similarity to sample IDs: ',
    mclust::adjustedRandIndex(gojo[[]]$sample, gojo[[]]$rpca.clusters)
))
print(c(
    'Cluster similarity to cell types: ',
    mclust::adjustedRandIndex(gojo[[]]$annotation, gojo[[]]$rpca.clusters)
))
#

In [None]:
# RPCA UMAP, colored by annotation, RPCA cluster and sample in turn
options(repr.plot.width = 16, repr.plot.height = 8)
DimPlot(gojo, reduction = "rpca.umap", label = TRUE, group.by = "annotation", label.size = 6, repel = TRUE)
# Color by the clusters computed on the RPCA graph. This previously used
# "cca.clusters", a copy-paste leftover that overlaid the CCA clustering on the
# RPCA embedding.
DimPlot(gojo, reduction = "rpca.umap", label = TRUE, group.by = "rpca.clusters", label.size = 6, repel = TRUE)
DimPlot(gojo, reduction = "rpca.umap", label = TRUE, group.by = "sample", label.size = 6, repel = TRUE)