Analysis: Vladyslav Kavaka (vladyslav.kavaka@med.uni-muenchen.de), Eduardo Beltran (eduardo.beltran@med.uni-muenchen.de)
Institute of Clinical Neuroimmunology, LMU, Munich

In [4]:
# Print the R version, platform and attached packages for reproducibility.
sessionInfo()
# Fix the random-number seed so that stochastic steps are repeatable between runs.
set.seed(1)

R version 4.0.5 (2021-03-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 18.04.6 LTS

Matrix products: default
BLAS/LAPACK: /home/INIM/vladyslav.kavaka/miniconda3/envs/azimuth/lib/libopenblasp-r0.3.17.so

locale:
 [1] LC_CTYPE=C.UTF-8    LC_NUMERIC=C        LC_TIME=C          
 [4] LC_COLLATE=C        LC_MONETARY=C       LC_MESSAGES=C      
 [7] LC_PAPER=C          LC_NAME=C           LC_ADDRESS=C       
[10] LC_TELEPHONE=C      LC_MEASUREMENT=C    LC_IDENTIFICATION=C

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] celldex_1.0.0               Nebulosa_1.0.2             
 [3] harmony_0.1.0               Rcpp_1.0.8                 
 [5] enrichR_3.0                 qpcR_1.4-1                 
 [7] robustbase_0.93-9           rgl_0.108.3                
 [9] minpack.lm_1.2-1            MASS_7.3-55                
[11] MAST_1.16.0                 SingleCellExperiment

## Imports

In [2]:
library(devtools)
library(Seurat)
library(dplyr)
library(Matrix)
library(tidyr)
library(limma)
library(ggplot2)
library(ggthemes)
library(patchwork)
library(gprofiler2)
library(ggrepel)
library(scales)
library(ggthemes)
library(purrr)
library(MAST)
library(qpcR)
library(enrichR)
library(harmony)
library(Nebulosa)
library(celldex)

## Custom Utility Functions

The figsize in R is specified in inches, 1 inch = 2.54 cm.

In [None]:
#' Set the size used for subsequently rendered notebook plots.
#'
#' Writes the `repr.plot.width` and `repr.plot.height` options.
#'
#' @param width  Plot width (inches).
#' @param height Plot height (inches).
#' @return Invisibly, the previous values of the two options (the value of `options()`).
set_figsize <- function(width, height) {
    plot_dims <- list(repr.plot.width = width, repr.plot.height = height)
    options(plot_dims)
}

# Load in the data

In [None]:
# Load the merged single-cell object: load() restores the objects stored in the .Rdata file
# into the workspace.
# NOTE(review): the next cell reads `sc.obj`, which is presumably restored by this call — confirm.

load('pathway/dataset_sc_Steidl/EGAF00004049210/HL_merged_sc_obj/HL_merged_sc_obj.Rdata')

# Create Seurat object

In [None]:
# Convert `sc.obj` into a Seurat object, taking the raw counts from its 'counts' assay.
obj <- as.Seurat(sc.obj, counts = 'counts')

In [None]:
# Rename the assay 'originalexp' to 'RNA'; later cells refer to the assay by that name.
obj <- RenameAssays(obj, originalexp = 'RNA')

In [None]:
# Slim the object down: keep only the RNA assay with its raw counts and drop scaled data,
# dimensional reductions and graphs.
obj <- DietSeurat(obj, assays = 'RNA', counts = TRUE, scale.data = FALSE, dimreducs = NULL, graphs = NULL)

In [None]:
# Print a summary of the slimmed-down object.
obj

## QC

In [None]:
# The number of features and UMIs (nFeature_RNA and nCount_RNA) are automatically calculated for every object by Seurat.
# For non-UMI data, nCount_RNA represents the sum of the non-normalized values within a cell.
# Here we calculate the share of counts coming from mitochondrial features ("^MT-") per cell.
# Despite its name, `percent.mito` is a fraction in [0, 1] (it is not multiplied by 100).
# We use raw count data since this represents non-transformed and non-log-normalized counts.
# The share of UMIs mapping to MT-features is a common scRNA-seq QC metric.
mito.features <- grep(pattern = "^MT-", x = rownames(x = obj), value = TRUE)
# Fetch the raw count matrix once instead of once per colSums() call.
raw.counts <- GetAssayData(object = obj, slot = 'counts')
# drop = FALSE keeps the subset two-dimensional even when exactly one gene matches;
# otherwise it collapses to a plain vector and colSums() fails.
percent.mito <- Matrix::colSums(x = raw.counts[mito.features, , drop = FALSE]) / Matrix::colSums(x = raw.counts)
# The temporary reference to the count matrix is no longer needed.
rm(raw.counts)

In [None]:
# The [[ operator can add columns to object metadata, and is a great place to stash QC stats
obj[['percent.mito']] <- percent.mito

# Draw one violin plot per QC metric. The same recipe was previously copy-pasted three times;
# a loop keeps the three plots consistent.
for (qc.feature in c("nFeature_RNA", "nCount_RNA", "percent.mito")) {
    plot <- VlnPlot(object = obj, features = qc.feature, ncol = 3, pt.size = 0.000001)
    # Swap the first two ggplot layers, i.e. reverse the order in which they are drawn.
    plot$layers[c(1, 2)] <- plot$layers[c(2, 1)]
    # Inside a loop plots are not auto-printed, so display each one explicitly.
    print(plot)
}

# Integration

In [None]:
# Copy the `dataset` metadata column into a new `sample` column, which is used below
# as the splitting/grouping variable, and list the distinct sample labels.
obj@meta.data$sample <- obj@meta.data$dataset
unique(obj$sample)

In [None]:
# Print a timestamp (used throughout the integration section to track run time).
Sys.time()
# Split the object into a list of per-sample Seurat objects.
obj.list <- SplitObject(obj, split.by = "sample")

In [None]:
# Names of all genes whose symbol starts with one of the listed TR*/IG* prefixes.
# These genes are removed from the variable features in the next cell.
# Note: "^IGK" already covers everything matched by "^IGKV".
gene.names <- rownames(x = obj)
markers.remove <- gene.names[grepl(pattern = "^TRAV|^TRBV|^TRGV|^TRDV|^IGKV|^IGLV|^IGHV|^IGHG|^IGK", x = gene.names)]

In [None]:
# Normalise every per-sample object and select its 3000 most variable features (vst).
obj.list <- lapply(X = obj.list, FUN = function(x) {
    x <- NormalizeData(x)
    FindVariableFeatures(x, selection.method = "vst", nfeatures = 3000)
})

# Drop the genes listed in `markers.remove` from each sample's variable features.
# seq_along() is safe for an empty list, whereas 1:length() would iterate over c(1, 0).
for (i in seq_along(obj.list)) {
    sample.hvg <- VariableFeatures(object = obj.list[[i]])
    VariableFeatures(obj.list[[i]]) <- sample.hvg[!(sample.hvg %in% markers.remove)]
}
Sys.time()

In [None]:
# Choose 2500 features across the per-sample objects to use for integration.
features <- SelectIntegrationFeatures(object.list = obj.list, nfeatures = 2500)

In [None]:
# Timestamp before the per-sample scaling/PCA step.
Sys.time()

In [None]:
# For every sample: scale the integration features, then run PCA on those same features.
obj.list <- lapply(X = obj.list, FUN = function(sample.obj) {
    scaled.obj <- ScaleData(sample.obj, features = features, verbose = FALSE)
    RunPCA(scaled.obj, features = features, verbose = FALSE)
})
Sys.time()

In [None]:
# Find integration anchors between the samples on the selected features,
# using reciprocal PCA ("rpca") as the reduction.
obj.anchors <- FindIntegrationAnchors(object.list = obj.list, anchor.features = features, reduction = "rpca")
Sys.time()

In [None]:
# Integrate the samples using the anchors computed above.
obj.integrated <- IntegrateData(anchorset = obj.anchors)
Sys.time()

In [None]:
# Print a summary of the integrated object.
obj.integrated

In [None]:
# Checkpoint: save the integrated object before scaling, PCA and UMAP are run on it.
saveRDS(obj.integrated, file = './obj.integrated.withoutumap.rds')

# Working with integrated file

In [None]:
# Scale the variable features of the integrated object while regressing out the total
# count and the mitochondrial fraction, then run PCA on the same feature set.
integ.features <- VariableFeatures(object = obj.integrated)
obj.integrated <- ScaleData(
    obj.integrated,
    features = integ.features,
    vars.to.regress = c("nCount_RNA", "percent.mito")
)
obj.integrated <- RunPCA(obj.integrated, features = integ.features)

In [None]:
# Use square 11 x 11 inch figures for the following plots.
default_width <- 11
set_figsize(default_width, default_width)

In [None]:
# Use the sample label as the active identity and plot the cells in PCA space,
# labelled by that identity.
Idents(obj.integrated) <- 'sample'
DimPlot(obj.integrated, reduction = 'pca', label = TRUE)

In [None]:
# Violin plot of ACTB per sample (identities set to `sample`); pt.size = 0 hides the points.
Idents(obj.integrated) <- 'sample'
VlnPlot(obj.integrated, "ACTB", pt.size = 0)

In [None]:
# ProjectDim scores each feature in the dataset (including features not included in the PCA) based on their correlation
# with the calculated components. Though we don't use this further here, it can be used to identify markers that
# are strongly correlated with cellular heterogeneity, but may not have passed through variable feature selection.
# The results of the projected PCA can be explored by setting `projected = TRUE` in the functions above.
obj.integrated <- ProjectDim(object = obj.integrated)

In [None]:
# Elbow plot over the first 50 principal components, to help choose how many to keep.
ElbowPlot(object = obj.integrated, ndims = 50)

In [None]:
# Draw one heatmap per principal component for PCs 1 to 20, each restricted to
# 500 cells and with balanced = TRUE.
for (pc in 1:20) {
    DimHeatmap(object = obj.integrated, dims = pc, cells = 500, balanced = TRUE)
}

# Cluster the cells

In [None]:
# Number of principal components used for the neighbour graph and for UMAP below.
dim_number <- 30

In [None]:
# Build the nearest-neighbour graph on the first `dim_number` dimensions.
obj.integrated <- FindNeighbors(object = obj.integrated, dims = 1:dim_number)

In [None]:
# Clustering resolution passed to FindClusters() in the next cell.
res <- 0.8

In [None]:
# Cluster the cells at resolution `res`.
obj.integrated <- FindClusters(object = obj.integrated, resolution = res)

# Run Non-linear dimensional reduction (UMAP)

In [None]:
# Compute the UMAP embedding from the same `dim_number` dimensions used for clustering.
obj.integrated <- RunUMAP(obj.integrated, dims = 1:dim_number)

In [None]:
# From here on `obj` refers to the integrated object (this overwrites the pre-integration `obj`).
obj <- obj.integrated

## Sample Effect, Chip, Old clusters

In [None]:
# Base figure width (inches) for the plots in this section.
default_width <- 12

In [None]:
set_figsize(default_width, default_width)
# Embedding coloured and labelled by sample.
# NOTE(review): `fig()` is not defined in the visible part of this notebook; presumably it
# displays/saves the plot under the given name — confirm it is defined before this cell runs.
fig(DimPlot(obj, group.by="sample", label = TRUE), 
    "UMAP_Sample")

In [None]:
set_figsize(default_width, default_width)
# Embedding coloured and labelled by the `Chip` metadata column.
# NOTE(review): `fig()` is not defined in the visible part of this notebook; presumably it
# displays/saves the plot under the given name — confirm it is defined before this cell runs.
fig(DimPlot(obj, group.by="Chip", label = TRUE), 
    "UMAP_Chip")

In [None]:
set_figsize(default_width, default_width)
# Embedding coloured and labelled by the `cluster_name` metadata column
# (presumably the cluster annotation shipped with the dataset — confirm).
# NOTE(review): `fig()` is not defined in the visible part of this notebook; presumably it
# displays/saves the plot under the given name — confirm it is defined before this cell runs.
fig(DimPlot(obj, group.by="cluster_name", label = TRUE), 
    "UMAP_oldclusters")

In [None]:
set_figsize(default_width,default_width)
# UMAP coloured and labelled by the clusters stored in `seurat_clusters`.
# NOTE(review): `fig()` is not defined in the visible part of this notebook — confirm it is
# available before this cell runs.
fig(DimPlot(obj, reduction = "umap", label=TRUE, label.size=6, group.by="seurat_clusters"), 
    "Cluster_UMAP")

## Cell Cycle

In [None]:
# CellCycleScoring() needs lists of S-phase and G2/M-phase genes. They are not defined anywhere
# earlier in this notebook, so fall back to the gene lists shipped with Seurat (`cc.genes`)
# when they are absent; values already defined in the session are left untouched.
if (!exists("s.genes")) {
    s.genes <- cc.genes$s.genes
}
if (!exists("g2m.genes")) {
    g2m.genes <- cc.genes$g2m.genes
}
# NOTE: set.ident = TRUE makes the assigned phase the active identity of `obj`; code that relies
# on Idents(obj) being the clusters must reset the identities afterwards.
obj <- CellCycleScoring(obj, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)

In [None]:
set_figsize(default_width, default_width)
# Embedding coloured by the cell-cycle phase assigned above.
# NOTE(review): `fig()` is not defined in the visible part of this notebook — confirm it is
# available before this cell runs.
fig(DimPlot(obj, group.by="Phase"), 
    "UMAP_Cellcycle")

## Cluster QC-Metrics

In [None]:
# Base figure width (inches) for the per-cluster QC plots.
default_width <- 8

In [None]:
# Tall figure: three violin panels stacked in one column.
set_figsize(default_width,3*default_width)
# Per-cluster distributions of the three QC metrics; pt.size = 0 hides the points.
# NOTE(review): `fig()` is not defined in the visible part of this notebook — confirm it is
# available before this cell runs.
fig(VlnPlot(obj, c("nFeature_RNA", "nCount_RNA", "percent.mito"), ncol=1, group.by="seurat_clusters", pt.size=0), 
    "Cluster_QC")

### Cluster Cell Cycle

In [None]:
set_figsize(default_width, default_width)
# Bar chart of cell-cycle phase composition per cluster; position = "fill" rescales every
# bar to a total of 1, so the bars show proportions.
ggplot(data = obj@meta.data, mapping = aes(x = seurat_clusters, fill = Phase)) +
    geom_bar(position = "fill")

### Sample Proportions of Each Cluster

In [None]:
set_figsize(2*default_width, default_width)

# Left panel: number of cells per cluster, stacked by sample.
bars.absolute <- ggplot(obj@meta.data, aes(x = seurat_clusters, fill = sample)) +
    geom_bar(position = "stack") +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
    ggtitle("Absolute")

# Right panel: the same data with every bar rescaled to 1, on a blank background.
bars.relative <- ggplot(obj@meta.data, aes(x = seurat_clusters, fill = sample)) +
    geom_bar(position = "fill") +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
          panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
          panel.background = element_blank(), axis.line = element_line(colour = "black")) +
    ggtitle("Relative")

# Adding two ggplots places the panels side by side (patchwork).
# NOTE(review): `fig()` is not defined in the visible part of this notebook — confirm it is
# available before this cell runs.
fig((bars.absolute + bars.relative), 
    "Cluster_Sample_Proportion")

In [None]:
# Composition plots: (1) disease state per cluster, (2) sample per cluster,
# (3) cluster composition per disease state.
set_figsize(1.5*default_width, default_width)

df <- obj@meta.data
df$clusters <- df$seurat_clusters
clusters <- unique(df$clusters)
dis_state <- 'HL'
df$dis <- df$Type
df$dis <- factor(df$dis, levels = c('RLN', 'HL'))

# Per-cell helpers, computed once and reused for both cluster orderings.
cell.cluster <- as.character(df$clusters)   # cluster label of every cell
cell.is.dis <- df$dis %in% dis_state        # TRUE for cells of the disease state (never NA)
cluster.key <- as.character(clusters)       # clusters in order of first appearance

# Theme shared by all three bar charts.
fraction_theme <- theme(
    plot.title = element_text(hjust = 0.45),
    text = element_text(size=25),
    panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
    panel.background = element_blank(), axis.line = element_line(colour = "black"),
    axis.text.x = element_text(angle = 55, vjust = 1, hjust=1, colour = 'black'))

# Order the clusters by the fraction of their cells that come from HL
# (vectorised replacement for the per-cluster filter() loop).
order_df <- data.frame(
    cluster = clusters,
    dis = as.vector(tapply(cell.is.dis, cell.cluster, mean)[cluster.key]))
order_df <- order_df[order(order_df$dis), ]
order_list <- order_df$cluster

# Order the clusters in the df
df$clusters <- factor(df$clusters, levels = order_list)

# Plot the disease state per cluster
ggplot(df, aes(x = clusters, fill = dis)) +
    geom_bar(position="fill") + 
    fraction_theme +
    scale_fill_manual('legend', values = c('lightgrey', '#D3556E')) +
    ylab('Fraction') +
    ggtitle("Relative")

set_figsize(2*default_width, default_width)

# Plot the sample (patient) composition per cluster, same cluster order as above
ggplot(df, aes(x = clusters, fill = sample)) +
    geom_bar(position="fill") + 
    fraction_theme +
    ylab('Fraction') +
    ggtitle("Relative")

# Plot per disease state
set_figsize(1.5*default_width, default_width)

# Re-order the clusters by their share of all HL cells
# (HL cells in the cluster divided by the total number of HL cells).
order_df <- data.frame(
    cluster = clusters,
    dis = as.vector(tapply(cell.is.dis, cell.cluster, sum)[cluster.key]) / sum(cell.is.dis))
order_df <- order_df[order(order_df$dis), ]
order_list <- order_df$cluster

# Order the clusters in the df
df$clusters <- factor(df$clusters, levels = order_list)

# Determine colors
c25 <- c(
  "dodgerblue2", "#E31A1C", # red
  "green4",
  "#6A3D9A", # purple
  "#FF7F00", # orange
  "black", "gold1",
  "skyblue2", "#FB9A99", # lt pink
  "palegreen2",
  "#CAB2D6", # lt purple
  "#FDBF6F", # lt orange
  "gray70", "khaki2",
  "maroon", "orchid1", "deeppink1", "blue1", "steelblue4",
  "darkturquoise", "green1", "yellow4", "yellow3",
  "darkorange4", "brown"
)
# One colour per cluster. Indexing c25 beyond its 25 entries would give NA colours,
# so extra colours are appended when there are more clusters than palette entries.
n.clusters <- length(clusters)
if (n.clusters <= length(c25)) {
    cols <- c25[seq_len(n.clusters)]
} else {
    cols <- c(c25, grDevices::rainbow(n.clusters - length(c25)))
}

ggplot(df, aes(x = dis, fill = clusters)) +
    geom_bar(position="fill") + 
    fraction_theme +
    ylab('Fraction') +
    scale_fill_manual('legend', values = cols) +
    ggtitle("Relative")

### Cluster Size

In [None]:
set_figsize(default_width, default_width)
# Number of cells per cluster.
ggplot(data = obj@meta.data, mapping = aes(x = seurat_clusters)) +
    geom_bar()

In [None]:
# Checkpoint: save the clustered object with its UMAP and cell-cycle scores.
saveRDS(obj, file = './rpca_integrated.rds')

In [None]:
# Reload the checkpoint, so the analysis can be resumed from here in a fresh session.
obj <- readRDS(file = './rpca_integrated.rds')

In [None]:
# Print a summary of the reloaded object.
obj

# Find Cluster Markers

## Find Markers

In [None]:
# Candidate genes for marker detection: all genes of the object except those whose symbol
# starts with one of the listed TR*/IG* prefixes.
features_find <- rownames(obj)
markers.remove <- grep(pattern = "^TRAV|^TRBV|^TRGV|^TRDV|^IGKV|^IGLV|^IGHV|^IGHG|^IGK", x = rownames(obj), value = TRUE)
features_find <- features_find[!(features_find%in%markers.remove)]

# FindAllMarkers() compares the active identities. CellCycleScoring(set.ident = TRUE) earlier in
# this notebook switched them to the cell-cycle phase, so select the clusters explicitly.
Idents(obj) <- 'seurat_clusters'

# Positive markers only, expressed in at least 10% of cells, log fold-change threshold 0.2.
# `verbose` was previously passed as `verbose = verbose`, but no such object is defined in
# this notebook; TRUE shows progress output.
markers <- FindAllMarkers(obj, 
                          min.pct = 0.1, 
                          logfc.threshold = 0.2, 
                          only.pos = TRUE, 
                          verbose = TRUE,
                          features = features_find)

In [None]:
# Save the unsorted marker table.
write.csv(markers, file = './first_markers_unsorted_rpca.csv')

In [None]:
# Reload the marker table; the first CSV column holds the row names.
markers <- read.csv(file = './first_markers_unsorted_rpca.csv', row.names = 1)

Sort the markers:

In [None]:
# Keep, per cluster, the `top_number` markers with the highest average log2 fold-change ...
top_number <- 100
ranked.markers <- arrange(group_by(markers, cluster), cluster, desc(avg_log2FC))
sorted.markers <- slice_head(ranked.markers, n = top_number)
# ... and then drop those that are not significant after multiple-testing adjustment.
# The filter runs after the top-N selection, so a cluster can end up with fewer than
# `top_number` rows.
is.significant <- sorted.markers$p_val_adj < 0.05
sorted.markers <- sorted.markers[is.significant, ]

In [None]:
# Start the annotated table as a copy of the sorted markers; it is replaced by the
# joined table in the annotation step below.
sorted.markers.anno <- sorted.markers

## Annotate Markers

In [None]:
library(annotables, quietly=TRUE)

In [None]:
# Build a gene annotation lookup (symbol, description, biotype) from `grch38`,
# keeping only the first row for every gene symbol.
ref <- grch38[c('symbol', 'description', 'biotype')]
ref <- distinct(ref, symbol, .keep_all = TRUE)
# Attach the annotation to every marker; markers without a matching symbol get NA.
sorted.markers.anno <- left_join(sorted.markers, ref, by=c('gene'='symbol'))

In [None]:
# Save the annotated marker table.
write.csv(sorted.markers.anno, file = './first_markers_annotated_rpca.csv')

## Marker Heatmap and Dotplot

In [None]:
set_figsize(1.5*default_width,1.5*default_width)

# Genes to plot: per cluster the `top_number` markers with the highest average log2
# fold-change, afterwards restricted to adjusted p-value < 0.05.
top_number <- 5
ranked.for.plot <- arrange(group_by(markers, cluster), cluster, desc(avg_log2FC))
markers_plot <- slice_head(ranked.for.plot, n = top_number)
markers_plot <- markers_plot[markers_plot$p_val_adj < 0.05, ]

# Copy of the object with reversed identity order, and its per-identity average
# expression (RNA assay) returned as a Seurat object.
obj_plot <- obj
levels(obj_plot) <- rev(levels(obj))
obj.average <- AverageExpression(obj_plot, assay = "RNA", return.seurat = TRUE)

# Figure size of the heatmap (inches); this replaces the size set at the top of the cell.
width <- 10.5
height <- 26
options(repr.plot.width = width, repr.plot.height = height)

# Heatmap of the averaged expression with a blue - white - red colour scale.
heatmap.theme <- theme(text = element_text(size = 20, face = "plain", colour = 'black'),
                       axis.text.y=element_text(colour="black", size = 18))
heatmap.fill <- scale_fill_gradientn(colors = c("#2881C1", "white", "#D3556E"))
hm <- DoHeatmap(obj.average, features = markers_plot$gene, draw.lines = FALSE,size = 8, angle = 270, hjust = 1, raster = FALSE) +
    heatmap.theme +
    heatmap.fill
#ggsave(hm, file = './outs/heatmap_clusters.pdf', width = width, height = height)
hm

In [None]:
# Dot plot of the selected marker genes (each gene shown once) across the identities
# of `obj_plot`, drawn on a wide figure.
width <- 30
height <- 8
options(repr.plot.width = width, repr.plot.height = height)

dotplot.theme <- theme(
    text = element_text(size = 15),
    axis.text = element_text(size = 15),
    legend.text=element_text(size = 15))
plot <- DotPlot(obj_plot, features = unique(markers_plot$gene), dot.scale = 7, cols = c('white', '#D3556E')) +
    RotatedAxis() +
    dotplot.theme
plot
#ggsave(plot, file = './outs/dotplot_clusters.pdf', width = width, height = height)

# Rename cluster idents

In [None]:
# New names: lookup table from cluster number (as text) to cell-type label.
# Clusters 3 and 25 both map to 'B_m'.
cluster.labels <- c(
    '0'  = 'Tn_1',
    '1'  = 'B_2',
    '2'  = 'CD4m_1',
    '3'  = 'B_m',
    '4'  = 'Treg_3',
    '5'  = 'Tn_2',
    '6'  = 'B_3',
    '7'  = 'CD4m_2',
    '8'  = 'CD8m_1',
    '9'  = 'CD8m_2',
    '10' = 'Treg_4',
    '11' = 'B_1',
    '12' = 'Treg_1',
    '13' = 'B_ifn',
    '14' = 'T_exh_1',
    '15' = 'Treg_2',
    '16' = 'T_ifn',
    '17' = 'Proliferative',
    '18' = 'NK-T',
    '19' = 'GCB',
    '20' = 'B_4',
    '21' = 'T_exh_2',
    '22' = 'pDC',
    '23' = 'Macrophage',
    '24' = 'Plasma',
    '25' = 'B_m'
)

# Translate every cell's cluster; cells whose cluster is not in the table get 'FALSE'.
cell.labels <- unname(cluster.labels[as.character(obj@meta.data$seurat_clusters)])
cell.labels[is.na(cell.labels)] <- 'FALSE'
obj@meta.data$clusters_all <- cell.labels

In [None]:
# Make the new cell-type labels the active identities.
Idents(obj) <- 'clusters_all'

# Fix the order of the identity levels: B-lineage labels first, then pDC and Macrophage,
# then the T-cell labels, NK-T and Proliferative.
levels(obj) <- c('B_1', 'B_2', 'B_3', 'B_4', 'B_ifn', 'B_m', 'GCB', 'Plasma',
                'pDC', 'Macrophage', 
                'Tn_1', 'Tn_2', 'CD4m_1', 'CD4m_2', 'Treg_1','Treg_2', 'Treg_3', 'Treg_4',
                 'T_ifn',
                 'CD8m_1', 'CD8m_2', 'T_exh_1', 'T_exh_2', 'NK-T', 'Proliferative')

In [None]:
set_figsize(13,10)
# UMAP labelled by the active identities (the renamed clusters).
# NOTE(review): this reuses the name "Cluster_UMAP" already passed to `fig()` for the
# numbered-cluster UMAP; if `fig()` writes a file under that name, the earlier output would
# be overwritten — confirm. `fig()` itself is not defined in the visible part of this notebook.
fig(DimPlot(obj, reduction = "umap", label=TRUE, label.size=6), 
    "Cluster_UMAP")

In [None]:
# Save the annotated object; this overwrites the earlier checkpoint written to the same path.
saveRDS(obj, file = './rpca_integrated.rds')