Analysis: Vladyslav Kavaka (vladyslav.kavaka@med.uni-muenchen.de), Eduardo Beltran (eduardo.beltran@med.uni-muenchen.de)
Institute of Clinical Neuroimmunology, LMU, Munich

In [7]:
# Print the R version, platform and attached packages for the record.
sessionInfo()
# Fix the random number generator seed for this session.
set.seed(1)

R version 4.0.5 (2021-03-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 18.04.6 LTS

Matrix products: default
BLAS/LAPACK: /home/INIM/vladyslav.kavaka/miniconda3/envs/azimuth/lib/libopenblasp-r0.3.17.so

locale:
 [1] LC_CTYPE=C.UTF-8    LC_NUMERIC=C        LC_TIME=C          
 [4] LC_COLLATE=C        LC_MONETARY=C       LC_MESSAGES=C      
 [7] LC_PAPER=C          LC_NAME=C           LC_ADDRESS=C       
[10] LC_TELEPHONE=C      LC_MEASUREMENT=C    LC_IDENTIFICATION=C

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] enrichR_3.0                 qpcR_1.4-1                 
 [3] robustbase_0.93-9           rgl_0.108.3                
 [5] minpack.lm_1.2-1            MASS_7.3-55                
 [7] MAST_1.16.0                 SingleCellExperiment_1.12.0
 [9] SummarizedExperiment_1.20.0 Biobase_2.50.0             
[11] GenomicRanges_1.42.0        GenomeInfoDb_1.26.4 

In [None]:
library(devtools)
library(Seurat)
library(dplyr)
library(Matrix)
library(tidyr)
library(limma)
library(ggplot2)
library(ggthemes)
library(patchwork)
library(gprofiler2)
library(ggrepel)
library(scales)
library(ggthemes)
library(purrr)
library(MAST)
library(qpcR)
library(enrichR)

In [5]:
# Set the notebook plot size.
#
# width, height: values stored in the `repr.plot.width` and
#   `repr.plot.height` options.
# Returns (invisibly) the previous values of the two options, as options() does.
set_figsize <- function(width, height) {
    plot_dims <- list(repr.plot.width = width,
                      repr.plot.height = height)
    options(plot_dims)
}

In [6]:
options(repr.plot.width=11, repr.plot.height=11)

# Read 10X data

In [None]:
## Read 10X data:
# Placeholder path: point this at the directory with the combined 10X output before running.
matrix_dir = "/pathway_to_combined_10X_output/"

In [None]:
# Show the files found in the input directory.
list.files(matrix_dir)

In [None]:
# Load the count matrix from matrix_dir.
data <- Read10X(data.dir = matrix_dir)
## Create Seurat object
# min.cells = 3 / min.features = 200 are passed as the initial feature and cell thresholds.
pbmc <- CreateSeuratObject (counts = data, min.cells = 3, min.features = 200, project = "TWINS")

In [None]:
pbmc

In [None]:
# The number of features and UMIs (nFeature_RNA and nCount_RNA) are automatically calculated for every object by Seurat.
# For non-UMI data, nCount_RNA represents the sum of the non-normalized values within a cell.
# Here we compute, per cell, the share of raw (non-normalized) counts that map to
# mitochondrial features (names starting with "MT-"). The result is a fraction between
# 0 and 1 (it is not multiplied by 100), despite the name `percent.mito`.
# The share of UMIs mapping to MT-features is a common scRNA-seq QC metric.
mito.features <- grep(pattern = "^MT-", x = rownames(x = pbmc), value = TRUE)
# Fetch the raw count matrix once and reuse it for numerator and denominator.
raw.counts <- GetAssayData(object = pbmc, slot = 'counts')
# drop = FALSE keeps a matrix even when a single mitochondrial feature is present;
# without it the subset collapses to a vector and colSums() fails.
percent.mito <- Matrix::colSums(x = raw.counts[mito.features, , drop = FALSE]) / Matrix::colSums(x = raw.counts)

In [None]:
# The [[ operator can add columns to object metadata, and is a great place to stash QC stats
pbmc[['percent.mito']] <- percent.mito
VlnPlot(object = pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mito"), ncol = 3, pt.size = 0.000001)

In [None]:
# FeatureScatter is typically used to visualize feature-feature relationships, but can be used for anything 
# calculated by the object, i.e. columns in object metadata, PC scores etc.
# Since there is a rare subset of cells with an outlier level of high mitochondrial percentage
# and also low UMI content, we filter these as well
FeatureScatter(object = pbmc, feature1 = "nCount_RNA", feature2 = "percent.mito")

In [None]:
FeatureScatter(object = pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

In [None]:
# Keep cells with more than 500 and fewer than 5,000 detected features and a
# mitochondrial fraction below 0.15 (15%; percent.mito is stored as a 0-1 fraction).
# The cutoff has to be numeric: comparing against the string '0.15' coerces percent.mito
# to character and compares text, so small fractions that print in scientific notation
# (e.g. "5e-05") sort after "0.15" and those cells would be discarded by mistake.
pbmc <- subset(x = pbmc, subset = nFeature_RNA > 500 & nFeature_RNA < 5000 & percent.mito < 0.15)

In [None]:
pbmc

In [None]:
VlnPlot(object = pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mito"), ncol = 3, pt.size = 0.00001)

## Create sample column

In [None]:
# Get batches based on cell names: characters 18-19 of each cell name are taken as the
# sample number (assumes a 16-character barcode followed by "-" - TODO confirm for new data).
# substr() is vectorised, so no per-cell sapply() is needed.
cell.names <- colnames(GetAssayData(object = pbmc, slot = "counts"))
samples_batches <- substr(cell.names, 18, 19)

In [None]:
# Turn to numbers and add cell names to them
samples_batches <- as.numeric(samples_batches)
names(samples_batches) <- cell.names
# Stop here if any cell name did not yield a numeric sample suffix, instead of
# carrying NA batches into the split/integration steps.
stopifnot(!anyNA(samples_batches))

In [None]:
sample.effect <- samples_batches

In [None]:
# Store the per-cell sample number in the metadata column `sample.effect`.
pbmc <- AddMetaData(pbmc, sample.effect, "sample.effect")

## Cluster and visualize cells using the usual scRNA-seq workflow, and examine for the potential presence of batch effects.

In [None]:
# Log-normalize the counts with a scale factor of 10,000.
pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", scale.factor = 1e4)

In [None]:
# Pick 2000 variable features (vst), then remove T-cell receptor V genes
# (names starting with TRAV/TRBV/TRGV/TRDV) from that list. The two length()
# calls show the number of variable features before and after the removal.
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
length(x = VariableFeatures(object = pbmc))
markers.remove <- grep(pattern = "^TRAV|^TRBV|^TRGV|^TRDV",  x = rownames(x = pbmc), value = TRUE)
VariableFeatures(object = pbmc) <- VariableFeatures(object = pbmc)[!(VariableFeatures(object = pbmc)%in%markers.remove)]
length(VariableFeatures(object = pbmc))

In [None]:
# Scale and run PCA on the remaining variable features only.
pbmc <- ScaleData(pbmc, features = VariableFeatures(pbmc))
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))

DimPlot(pbmc)

In [None]:
# ProjectDim scores each feature in the dataset (including features not included in the PCA) based on their correlation 
# with the calculated components. Though we don't use this further here, it can be used to identify markers that 
# are strongly correlated with cellular heterogeneity, but may not have passed through variable feature selection. 
# The results of the projected PCA can be explored by setting `projected = TRUE`in the functions above
pbmc <- ProjectDim(object = pbmc)

In [None]:
ElbowPlot(object = pbmc)

In [None]:
# Print one heatmap per principal component (components 1 to 30, 500 cells each).
for (pc in seq_len(30)) {
    pc_heatmap <- DimHeatmap(object = pbmc, dims = pc, cells = 500, balanced = TRUE)
    print(pc_heatmap)
}

## Cluster the cells

In [None]:
# Build the neighbour graph on PCs 1-20, then cluster at resolution 0.5.
pbmc <- FindNeighbors(object = pbmc, dims = 1:20)
pbmc <- FindClusters(object = pbmc, resolution = 0.5)

## Run UMAP

In [None]:
pbmc <- RunUMAP(pbmc, dims = 1:20)

In [None]:
#umap, dims 1:20, res. 0.5
# First plot: cells coloured by current identity; second plot: coloured by sample number.
DimPlot(pbmc, reduction = 'umap', label = TRUE)
DimPlot(pbmc, reduction = 'umap', label = TRUE, group.by = 'sample.effect')

Batch effect present, integration required

# Integration

In [None]:
Sys.time()
pbmc.list <- SplitObject(pbmc, split.by = "sample.effect")
pbmc.list

In [None]:
markers.remove <- grep(pattern = "^TRAV|^TRBV|^TRGV|^TRDV",  x = rownames(x = pbmc), value = TRUE)

In [None]:
# Normalize every per-sample object, select its 5000 most variable features (vst)
# and remove the TCR V genes listed in markers.remove from that selection.
# Done in a single pass; the object is returned explicitly instead of relying on
# the invisible value of the last assignment.
pbmc.list <- lapply(X = pbmc.list, FUN = function(x) {
    x <- NormalizeData(x)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 5000)
    hvg <- VariableFeatures(object = x)
    VariableFeatures(x) <- hvg[!(hvg %in% markers.remove)]
    x
})
Sys.time()

In [None]:
pbmc.list

In [None]:
# Choose up to 5000 features shared across the per-sample objects for integration.
features <- SelectIntegrationFeatures(object.list = pbmc.list, nfeatures = 5000)

In [None]:
Sys.time()

In [None]:
# Scale and run PCA per sample on the integration features (needed for reduction = "rpca" below).
pbmc.list <- lapply(X = pbmc.list, FUN = function(x) {
    x <- ScaleData(x, features = features, verbose = FALSE)
    x <- RunPCA(x, features = features, verbose = FALSE)
})
Sys.time()

In [None]:
# Find anchors between samples using reciprocal PCA.
pbmc.anchors <- FindIntegrationAnchors(object.list = pbmc.list, anchor.features = features, reduction = "rpca")
Sys.time()

In [None]:
# Build the integrated object from the anchors.
pbmc.integrated <- IntegrateData(anchorset = pbmc.anchors)
Sys.time()

In [None]:
pbmc.integrated

# Working with integrated file

In [None]:
# Select 2000 variable features on the integrated object and drop TCR V genes from them.
# NOTE(review): this runs on whatever assay is the default of pbmc.integrated at this point - confirm.
pbmc.integrated <- FindVariableFeatures(pbmc.integrated, selection.method = "vst", nfeatures = 2000)
length(x = VariableFeatures(object = pbmc.integrated))
markers.remove <- grep(pattern = "^TRAV|^TRBV|^TRGV|^TRDV",  x = rownames(x = pbmc.integrated), value = TRUE)
VariableFeatures(object = pbmc.integrated) <- VariableFeatures(object = pbmc.integrated)[!(VariableFeatures(object = pbmc.integrated)%in%markers.remove)]
length(VariableFeatures(object = pbmc.integrated))

In [None]:
#vst features
# Despite its name, top10 holds the first 20 variable features, which are labelled in the plot.
top10 <- head(VariableFeatures(pbmc.integrated), 20)
plot1 <- VariableFeaturePlot(pbmc.integrated)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot2

In [None]:
# Scale the variable features while regressing out nCount_RNA and percent.mito, then run PCA on them.
pbmc.integrated <- ScaleData(pbmc.integrated, features = VariableFeatures(object = pbmc.integrated), vars.to.regress = c("nCount_RNA", "percent.mito"))
pbmc.integrated <- RunPCA(pbmc.integrated, features = VariableFeatures(object = pbmc.integrated))

In [None]:
Idents(pbmc.integrated) <- 'sample.effect'
DimPlot(pbmc.integrated, reduction = 'pca', label = TRUE)

In [None]:
ElbowPlot(object = pbmc.integrated, ndims = 40)

In [None]:
for (i in 1:30){
    print(DimHeatmap(object = pbmc.integrated, dims = i, cells = 500, balanced = TRUE))
}

In [None]:
saveRDS(pbmc.integrated, file = './pbmc_integrated_withoutumap.rds')
saveRDS(pbmc, file = './pbmc_unintegrated.rds')

## Cluster the cells

In [None]:
# Neighbour graph on PCs 1-25 of the integrated object.
pbmc.integrated <- FindNeighbors(object = pbmc.integrated, dims = 1:25)

In [None]:
# Cluster at resolution 0.5.
pbmc.integrated <- FindClusters(object = pbmc.integrated, resolution = 0.5)

## Run UMAP

In [None]:
pbmc.integrated <- RunUMAP(pbmc.integrated, dims = 1:25)

In [None]:
# Second plot colours cells by sample number.
DimPlot(pbmc.integrated, reduction = 'umap', label = TRUE)
DimPlot(pbmc.integrated, reduction = 'umap', label = TRUE, group.by = 'sample.effect')

In [None]:
# Switch the default assay back to 'RNA' for the following steps.
DefaultAssay(pbmc.integrated) <- 'RNA'

# Adding the sample information

In [None]:
pbmc <- pbmc.integrated

In [None]:
samplenames <- read.csv2(file = './Sample_info.csv')
samplenames$Sample <- as.character(samplenames$Sample)

In [None]:
# Translate each cell's numeric sample.effect into the sample name from the sample
# sheet (columns Number and Sample) with one vectorised lookup instead of running
# filter() once per cell.
sample.idx <- match(pbmc@meta.data$sample.effect, samplenames$Number)
# Every sample number must be present in the sample sheet; the per-cell loop this
# replaces also stopped with an error when a number was missing.
stopifnot(!anyNA(sample.idx))
pbmc@meta.data$sample <- samplenames$Sample[sample.idx]

# Adding TCR information

In [None]:
# 1-16 folder for the TCRs of samples 1-16
# 17-32 folder for the TCRs of samples 17-32

In [None]:
samples <- unique(pbmc@meta.data$sample)
samples

In [None]:
tcr_folder = 'folder for the TCRs of samples 1-16'

In [None]:
# Read the TCR annotation of one sample and return one row per cell barcode.
#
# folder: path prefix of the TCR outputs; it is pasted directly in front of
#         'TR_<sample>/outs/', so it must end with a path separator.
# sample: sample name used in the 'TR_<sample>' directory name.
# suffix: number that replaces the trailing "-1" of every barcode ("-<suffix>").
# Returns a data frame with the barcodes as row names and the columns
# TCR_clonotype_id, TCR_chain, TCR_v_gene, TCR_frequency, TCR1, TCR2, TCR3, TCR4.
# Missing values are replaced by the string "FALSE".
read_sample_tcr <- function(folder, sample, suffix){
    outs_dir <- paste0(folder, 'TR_', sample, '/outs/')
    contigs <- read.csv(paste0(outs_dir, "filtered_contig_annotations.csv"))
    # change barcode numbers according to the sample; anchored at the end so that
    # only the trailing suffix can be rewritten
    contigs$barcode <- sub("-1$", paste0('-', suffix), contigs$barcode)
    # place TRB on top before removing duplicates, so the kept row per barcode is
    # the one that sorts first by chain in decreasing order
    contigs <- contigs[order(contigs$chain, decreasing = TRUE), ]
    contigs <- contigs[!duplicated(contigs$barcode), ]
    #choose the columns to keep
    contigs <- contigs[, c("barcode", "raw_clonotype_id", "chain", "v_gene")]
    names(contigs)[names(contigs) == "raw_clonotype_id"] <- "clonotype_id"
    #read clonotypes file and attach frequency and CDR3 sequences per clonotype
    clono <- read.csv(paste0(outs_dir, "clonotypes.csv"))
    merged <- merge(contigs, clono[, c("clonotype_id", "frequency", "cdr3s_aa")], by = "clonotype_id")
    #Rename columns by name (not by position) and fix their order
    merged <- merged[, c("clonotype_id", "barcode", "chain", "v_gene", "frequency", "cdr3s_aa")]
    names(merged) <- c("TCR_clonotype_id", "barcode", "TCR_chain", "TCR_v_gene", "TCR_frequency", "TCR_cdr3")
    #barcodes become the row names
    rownames(merged) <- merged$barcode
    merged$barcode <- NULL
    #Split cdr3 column: one column per ';'-separated chain entry (at most four are kept)
    merged <- separate(data = merged, col = TCR_cdr3, into = c("TCR1", "TCR2", "TCR3", "TCR4"), sep = "\\;")
    merged[is.na(merged)] <- "FALSE"
    merged
}

# NOTE(review): the position of a sample in `samples` is used as its barcode suffix,
# i.e. `samples` is assumed to be ordered by sample number - confirm.
#First we need to create TCR file for the first sample in the list
tcr <- read_sample_tcr(tcr_folder, samples[1], 1)

head(tcr)
tcr.combined <- tcr

In [None]:
#Now samples 2 till 16 are read and appended in one step (no rbind inside a loop):
tcr.list <- lapply(2:16, function(i) read_sample_tcr(tcr_folder, samples[i], i))
tcr.combined <- do.call(rbind, c(list(tcr.combined), tcr.list))

In [None]:
#for samples 17 till 32:
tcr_folder <- 'folder for the TCRs of samples 17-32'
# Every sample after the 16th; empty (instead of counting backwards) when there
# are 16 samples or fewer.
late.samples <- setdiff(seq_along(samples), seq_len(16))
tcr.list <- lapply(late.samples, function(i) read_sample_tcr(tcr_folder, samples[i], i))
tcr.combined <- do.call(rbind, c(list(tcr.combined), tcr.list))

In [None]:
#divide in TRA and TRB subset:
# For each CDR3 slot (TCR1..TCR4) create two columns, <slot>B and <slot>A, holding the
# sequence after the last ':' when the entry starts with 'TRB:' / 'TRA:' and the string
# 'FALSE' otherwise. Whole columns are processed at once instead of row by row.
for (slot in c('TCR1', 'TCR2', 'TCR3', 'TCR4')){
    entries <- tcr.combined[[slot]]
    sequences <- sub(pattern = '.*:', replacement = '', x = entries)
    tcr.combined[[paste0(slot, 'B')]] <- ifelse(startsWith(entries, 'TRB:'), sequences, 'FALSE')
    tcr.combined[[paste0(slot, 'A')]] <- ifelse(startsWith(entries, 'TRA:'), sequences, 'FALSE')
}

In [None]:
# Remove the four raw CDR3 slot columns, then show the first and last rows.
tcr.combined[c('TCR1', 'TCR2', 'TCR3', 'TCR4')] <- NULL
head(tcr.combined)
tail(tcr.combined)

In [None]:
write.csv(tcr.combined, file = './tcr_pbmc_all.csv')

In [None]:
pbmc <- AddMetaData(object = pbmc, metadata = tcr.combined)

In [None]:
# Replace every NA in the cell metadata by the string "FALSE".
# Factor columns are turned into character first (so the new value can be stored)
# and converted back to factors afterwards.
md <- pbmc@meta.data
factor_cols <- vapply(md, is.factor, logical(1))
md[factor_cols] <- lapply(md[factor_cols], as.character)
md[is.na(md)] <- "FALSE"
md[factor_cols] <- lapply(md[factor_cols], as.factor)
pbmc@meta.data <- md

In [None]:
DimPlot(pbmc, reduction = 'umap', label = TRUE)
DimPlot(pbmc, reduction = 'umap', group.by = 'TCR_frequency', label = TRUE) + NoLegend()
DimPlot(pbmc, reduction = 'umap', group.by = 'TCR_chain', label = TRUE)

## TCR available column

In [None]:
# V6 marks cells with TCR information: 'FALSE' when TCR_frequency is the placeholder
# string 'FALSE', 'CD8' otherwise. Computed for the whole column at once.
pbmc@meta.data$V6 <- ifelse(pbmc@meta.data$TCR_frequency == 'FALSE', 'FALSE', 'CD8')

## Correct frequency column after pre-processing

In [None]:
#subset only T cells
Idents(pbmc) <- 'V6'
tcells <- subset(pbmc, idents = 'CD8')
tcells

In [None]:
# create a column containing sample effect and clonotype:
tcells@meta.data$Sample_Clono <- paste(tcells@meta.data$sample.effect, tcells@meta.data$TCR_clonotype_id, sep = '_')
# Corrected frequency = number of cells in `tcells` sharing the same sample + clonotype.
# One table() call replaces the comparison of every cell against every other cell.
clone.sizes <- table(tcells@meta.data$Sample_Clono)
tcells@meta.data$TCR_frequency_corrected <- as.numeric(clone.sizes[tcells@meta.data$Sample_Clono])
class(tcells@meta.data$TCR_frequency_corrected)


# TCR_Clono: '<sample>_<clonotype>_<frequency>' for clones with more than 2 cells,
# otherwise just the corrected frequency.
tcells@meta.data$TCR_Clono <- ifelse(
    tcells@meta.data$TCR_frequency_corrected > 2,
    paste(tcells@meta.data$sample.effect, tcells@meta.data$TCR_clonotype_id, tcells@meta.data$TCR_frequency_corrected, sep = '_'),
    tcells@meta.data$TCR_frequency_corrected)

## Adding expanded column

In [None]:
#add expand column: 'exp' for clones with more than 2 cells, 'nonexp' otherwise
tcells@meta.data$expand <- ifelse(tcells@meta.data$TCR_frequency_corrected > 2, 'exp', 'nonexp')

In [None]:
DimPlot(tcells, reduction = 'umap', group.by = 'TCR_frequency_corrected', label = TRUE) + NoLegend()
DimPlot(tcells, reduction = 'umap', group.by = 'expand', label = TRUE)

In [None]:
saveRDS(pbmc, file = './pbmc_with_tcr_20210708.rds')

In [None]:
saveRDS(tcells, file = './tcells_20210708.rds')

In [None]:
write.csv(tcells@meta.data, file = './tcells_metadata_08082021.csv')
write.csv(pbmc@meta.data, file = './pbmc_metadata_08082021.csv')

# Remove doublets T cells and T cells without beta chain

In [None]:
pbmc <- tcells

In [None]:
#let's find t cell doublets:
# tcr_b_sum = number of beta-chain slots per cell that hold a sequence
# (anything other than the placeholder 'FALSE'); computed row-wise in one call.
beta.cols <- c('TCR1B', 'TCR2B', 'TCR3B', 'TCR4B')
pbmc@meta.data$tcr_b_sum <- as.integer(rowSums(pbmc@meta.data[, beta.cols] != 'FALSE'))

In [None]:
table(pbmc@meta.data$tcr_b_sum)

In [None]:
# Cells with more than one beta chain are treated as T cell doublets; cells with
# none had only an alpha chain detected. Both groups are written to disk and
# collected (doublets first) in tcellstoremove.
beta.doublets <- filter(pbmc@meta.data, tcr_b_sum > 1)
beta.missing <- filter(pbmc@meta.data, tcr_b_sum == 0)
#save the t doublets:
write.csv(beta.doublets, file = './cd8_tcelldoublets.csv')
#save cells where only alpha chain was found:
write.csv(beta.missing, file = './cd8_tcell_no_betachain.csv')
tcellstoremove <- c(rownames(beta.doublets), rownames(beta.missing))

In [None]:
length(tcellstoremove)

In [None]:
#subset everything without doublets or cells without beta chain:
pbmc
pbmc <- subset(pbmc, cells = tcellstoremove, invert = TRUE)
pbmc

# First reclustering

In [None]:
# Work on the 'integrated' assay for the reclustering of the T cell subset.
DefaultAssay(pbmc) <- 'integrated'
pbmc

In [None]:
# Re-select 2000 variable features and drop TCR V genes from them; the two
# length() calls show the count before and after the removal.
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
length(x = VariableFeatures(object = pbmc))
markers.remove <- grep(pattern = "^TRAV|^TRBV|^TRGV|^TRDV",  x = rownames(x = pbmc), value = TRUE)
VariableFeatures(object = pbmc) <- VariableFeatures(object = pbmc)[!(VariableFeatures(object = pbmc)%in%markers.remove)]
length(VariableFeatures(object = pbmc))

In [None]:
# Despite its name, top10 holds the first 20 variable features, which are labelled in the plot.
top10 <- head(VariableFeatures(pbmc), 20)
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot2

In [None]:
# Scale the variable features while regressing out nCount_RNA and percent.mito, then run PCA on them.
pbmc <- ScaleData(pbmc, features = VariableFeatures(object = pbmc), vars.to.regress = c("nCount_RNA", "percent.mito"))
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))

In [None]:
Idents(pbmc) <- 'sample.effect'
DimPlot(pbmc, reduction = 'pca', label = TRUE)

In [None]:
ElbowPlot(object = pbmc, ndims = 50)

In [None]:
for (i in 1:30){
    print(DimHeatmap(object = pbmc, dims = i, cells = 500, balanced = TRUE))
}

## Cluster the cells

In [None]:
DefaultAssay(pbmc) <- 'integrated'

In [None]:
# Neighbour graph on PCs 1-25.
pbmc <- FindNeighbors(object = pbmc, dims = 1:25)

In [None]:
# Cluster at resolution 0.6.
pbmc <- FindClusters(object = pbmc, resolution = 0.6)

## Run UMAP

In [None]:
pbmc <- RunUMAP(pbmc, dims = 1:25)

In [None]:
#umap, dims 1:25, res. 0.6 (matches the FindClusters call above)
DimPlot(pbmc, reduction = 'umap', label = TRUE)
DimPlot(pbmc, reduction = 'umap', label = TRUE, group.by = 'sample.effect')

In [None]:
VlnPlot(pbmc, features = 'nFeature_RNA', pt.size = 0.01)

## Cluster markers

In [None]:
DefaultAssay(pbmc) <- 'RNA'

In [None]:
# Find positive markers per cluster, excluding TCR V genes and ribosomal protein
# genes (RPL/RPS prefixes) from the tested features.
Idents(pbmc) <- 'seurat_clusters'
featurespbmc <- rownames(pbmc)
markers.remove <- grep(pattern = "^TRAV|^TRBV|^TRGV|^TRDV|^RPL|^RPS", x = rownames(pbmc), value = TRUE)
featurespbmc <- featurespbmc[!(featurespbmc%in%markers.remove)]
pbmc.markers1 <- FindAllMarkers(object = pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25, features = featurespbmc)

#sort markers (top 30)
# Per cluster: order by decreasing avg_log2FC and keep at most 30 rows. head() copes
# with clusters that have fewer markers, and the pieces are bound once at the end.
pbmc.markers_sorted <- do.call(rbind, lapply(levels(pbmc.markers1$cluster), function(cl){
    cluster.markers <- filter(pbmc.markers1, cluster == cl)
    cluster.markers <- cluster.markers[order(-cluster.markers$avg_log2FC), ]
    head(cluster.markers, 30)
}))
pbmc.markers_sorted_top30 <- pbmc.markers_sorted
pbmc.markers_sorted_top30
write.csv(pbmc.markers_sorted_top30, file = './cd8_firstmarkers.csv')

In [None]:
#sort markers (top 5) and plot the heatmap
# Average expression per identity class, returned as a Seurat object for DoHeatmap.
cluster.averages_pbmc <- AverageExpression(pbmc, assay = "RNA", return.seurat = TRUE) # , verbose = FALSE)

# Per cluster: order by decreasing avg_log2FC and keep at most 5 rows.
pbmc.markers_sorted <- do.call(rbind, lapply(levels(pbmc.markers1$cluster), function(cl){
    cluster.markers <- filter(pbmc.markers1, cluster == cl)
    cluster.markers <- cluster.markers[order(-cluster.markers$avg_log2FC), ]
    head(cluster.markers, 5)
}))
pbmc.markers_sorted_top5 <- pbmc.markers_sorted
pbmc.markers_sorted_top5

DoHeatmap(cluster.averages_pbmc, features = pbmc.markers_sorted_top5$gene)

In [None]:
VlnPlot(pbmc, features = 'nFeature_RNA', pt.size = 0.01)

In [None]:
# cluster 18 - myeloid cells, cluster 8 and 10 - low quality cell, 16, 17 - mixed
# Collect the cell names of these identities (identities are the seurat_clusters set earlier).
cellstoremove <- WhichCells(pbmc, idents = c(8, 10, 16, 17, 18))
length(cellstoremove)

In [None]:
# Keep a record of the metadata of the removed cells.
write.csv(pbmc@meta.data[cellstoremove, ], file = './removed_clusters.csv')

In [None]:
# invert = TRUE keeps every cell that is NOT in cellstoremove.
pbmc <- subset(pbmc, cells = cellstoremove, invert = TRUE)
pbmc

# Second reclustering

In [None]:
DefaultAssay(pbmc) <- 'integrated'
pbmc

In [None]:
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
length(x = VariableFeatures(object = pbmc))
markers.remove <- grep(pattern = "^TRAV|^TRBV|^TRGV|^TRDV|^RPL|^RPS",  x = rownames(x = pbmc), value = TRUE)
VariableFeatures(object = pbmc) <- VariableFeatures(object = pbmc)[!(VariableFeatures(object = pbmc)%in%markers.remove)]
length(VariableFeatures(object = pbmc))

In [None]:
top10 <- head(VariableFeatures(pbmc), 20)
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot2

In [None]:
pbmc <- ScaleData(pbmc, features = VariableFeatures(object = pbmc), vars.to.regress = c("nCount_RNA", "percent.mito"))
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))

In [None]:
Idents(pbmc) <- 'sample.effect'
DimPlot(pbmc, reduction = 'pca', label = TRUE)

In [None]:
# ProjectDim scores each feature in the dataset (including features not included in the PCA) based on their correlation 
# with the calculated components. Though we don't use this further here, it can be used to identify markers that 
# are strongly correlated with cellular heterogeneity, but may not have passed through variable feature selection. 
# The results of the projected PCA can be explored by setting `projected = TRUE`in the functions above
pbmc <- ProjectDim(object = pbmc)

In [None]:
ElbowPlot(object = pbmc, ndims = 50)

In [None]:
for(i in 1:40){   
    print(DimHeatmap(object = pbmc, dims = i, cells = 500, balanced = TRUE))
}

## Cluster the cells 

In [None]:
DefaultAssay(pbmc) <- 'integrated'
pbmc

In [None]:
pbmc <- FindNeighbors(object = pbmc, dims = 1:20)

In [None]:
pbmc <- FindClusters(object = pbmc, resolution = 0.8)

## Run UMAP

In [None]:
pbmc <- RunUMAP(pbmc, dims = 1:20)

In [None]:
DimPlot(pbmc, reduction = 'umap', label = TRUE)
DimPlot(pbmc, reduction = 'umap', label = TRUE, group.by = 'sample.effect')

In [None]:
saveRDS(pbmc, file = 'cd8_withumap_20210810.rds')

## Cluster markers

In [None]:
DefaultAssay(pbmc) <- 'RNA'

In [None]:
# Find positive markers per identity class, excluding TCR V genes from the tested features.
featurespbmc <- rownames(pbmc)
markers.remove <- grep(pattern = "^TRAV|^TRBV|^TRGV|^TRDV", x = rownames(pbmc), value = TRUE)
featurespbmc <- featurespbmc[!(featurespbmc%in%markers.remove)]
pbmc.markers1 <- FindAllMarkers(object = pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25, features = featurespbmc)

#sort markers (top 30)
# Per cluster: order by decreasing avg_log2FC and keep at most 30 rows. head() copes
# with clusters that have fewer markers, and the pieces are bound once at the end.
pbmc.markers_sorted <- do.call(rbind, lapply(levels(pbmc.markers1$cluster), function(cl){
    cluster.markers <- filter(pbmc.markers1, cluster == cl)
    cluster.markers <- cluster.markers[order(-cluster.markers$avg_log2FC), ]
    head(cluster.markers, 30)
}))
pbmc.markers_sorted_top30 <- pbmc.markers_sorted
pbmc.markers_sorted_top30
write.csv(pbmc.markers_sorted_top30, file = './cd8_secondmarkers.csv')

In [None]:
# Average expression per identity class, returned as a Seurat object for DoHeatmap.
cluster.averages_pbmc <- AverageExpression(pbmc, assay = "RNA", return.seurat = TRUE) # , verbose = FALSE)
#sort markers (top 5)
# Per cluster: order by decreasing avg_log2FC and keep at most 5 rows.
pbmc.markers_sorted <- do.call(rbind, lapply(levels(pbmc.markers1$cluster), function(cl){
    cluster.markers <- filter(pbmc.markers1, cluster == cl)
    cluster.markers <- cluster.markers[order(-cluster.markers$avg_log2FC), ]
    head(cluster.markers, 5)
}))
pbmc.markers_sorted_top5 <- pbmc.markers_sorted
pbmc.markers_sorted_top5

DoHeatmap(cluster.averages_pbmc, features = pbmc.markers_sorted_top5$gene)

In [None]:
VlnPlot(pbmc, features = 'nFeature_RNA', pt.size = 0.01)

In [None]:
#remove cluster 12 as lowQC and mostly ribosomal genes

# Save the metadata of the cluster 12 cells before dropping them.
cluster12 <- filter(pbmc@meta.data, seurat_clusters == 12)
write.csv(cluster12, file = './ribosomalcluster.csv')
# Keep a copy of the object as it was before the removal.
pbmc_old <- pbmc

# Identities must be the cluster numbers for `idents = 12` to select that cluster;
# the object is printed before and after the subset.
Idents(pbmc) <- 'seurat_clusters'
pbmc
pbmc <- subset(pbmc, idents = 12, invert = TRUE)
pbmc

## Rename clusters

In [None]:
# 11  - most_naiive
# 6 - im_naiive
# 7 - CCR4_naiive
# 8 - NT5E_naiive
# 10 - CD82_pos
# 9 - NK-like
# 0 - MAIT
# 1 - im_infl_1
# 4 - im_infl_2
# 13 - IFN_sign
# combine 14, 2, 5 because of the overall similarity of the dge, expansion and almost no unique patterns - eff_1
# 3 - eff_2
# Lookup table cluster number -> name, applied to all cells at once. A cluster that is
# not listed here yields NA (and shows up in unique() below), whereas the per-cell
# if-chain this replaces left such cells with the label recycled from the first cell.
cluster.names <- c(
    '11' = 'most_naiive',
    '6'  = 'im_naiive',
    '8'  = 'NT5E_naiive',
    '7'  = 'CCR4_naiive',
    '10' = 'CD82_pos',
    '1'  = 'im_infl_1',
    '4'  = 'im_infl_2',
    '13' = 'IFN_sign',
    '14' = 'eff_1',
    '2'  = 'eff_1',
    '5'  = 'eff_1',
    '3'  = 'eff_2',
    '0'  = 'MAIT',
    '9'  = 'NK-like')
pbmc@meta.data$cd8names <- unname(cluster.names[as.character(pbmc@meta.data$seurat_clusters)])

unique(pbmc@meta.data$cd8names)

In [None]:
#rename the clusters
# cd8_coded is the numbered display name of each cluster. Names without an entry in
# the table (here 'MAIT' and 'NK-like') are carried over unchanged from cd8names.
cluster.codes <- c(
    most_naiive = '1_CCR7',
    im_naiive   = '2_NELL2',
    NT5E_naiive = '3_NT5E',
    CD82_pos    = '4_CD82',
    CCR4_naiive = '5_MAL',
    im_infl_1   = '6_GZMK',
    IFN_sign    = '7_MX1',
    im_infl_2   = '8_CD74',
    eff_2       = '9_IKZF2',
    eff_1       = '10_FGFBP2')
coded <- unname(cluster.codes[as.character(pbmc@meta.data$cd8names)])
pbmc@meta.data$cd8_coded <- ifelse(is.na(coded), pbmc@meta.data$cd8names, coded)

In [None]:
# Use the coded cluster names as identities and fix the order of the identity levels.
Idents(pbmc) <- 'cd8_coded'
levels(pbmc) <- c('NK-like', 'MAIT', '1_CCR7', '2_NELL2', '3_NT5E', '4_CD82', '5_MAL', '6_GZMK', '7_MX1', '8_CD74', '9_IKZF2', '10_FGFBP2')

# Final markers, UMAPS, plotting of clusters and signatures

In [None]:
#create the output dir
dir_plots <- './outs/'
# showWarnings = FALSE: re-running the notebook must not warn when the directory already exists.
dir.create(dir_plots, showWarnings = FALSE)

In [None]:
# UMAP of all clusters, shown in the notebook and saved as PDF in dir_plots.
width <- 15
height <- 12
name <- 'general_cd8_umap_raster'

options(repr.plot.width = width, repr.plot.height = height)
umap.theme <- theme(text = element_text(size = 20),
                    axis.text = element_text(size = 20),
                    legend.text = element_text(size = 18))
plot <- DimPlot(pbmc, reduction = "umap", label = TRUE, label.size = 9,
                repel = TRUE, raster = TRUE, pt.size = 0.01) +
    umap.theme
plot
ggsave(filename = paste0(dir_plots, name, '.pdf'), plot = plot, width = width, height = height)

In [None]:
#Find markers
# Positive markers per identity class, excluding TCR V genes and ribosomal protein
# genes (RPL/RPS prefixes) from the tested features.
featurespbmc <- rownames(pbmc)
markers.remove <- grep(pattern = "^TRAV|^TRBV|^TRGV|^TRDV|^RPL|^RPS", x = rownames(pbmc), value = TRUE)
featurespbmc <- featurespbmc[!(featurespbmc%in%markers.remove)]
pbmc.markers1 <- FindAllMarkers(object = pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25, features = featurespbmc)

# Return, for every cluster level of `markers`, at most `n` rows with the highest
# avg_log2FC, stacked in level order. head() copes with clusters that have fewer
# than n markers, and the per-cluster pieces are bound once.
top_markers_per_cluster <- function(markers, n){
    per.cluster <- lapply(levels(markers$cluster), function(cl){
        cluster.markers <- filter(markers, cluster == cl)
        cluster.markers <- cluster.markers[order(-cluster.markers$avg_log2FC), ]
        head(cluster.markers, n)
    })
    do.call(rbind, per.cluster)
}

#sort markers (top 30)
pbmc.markers_sorted <- top_markers_per_cluster(pbmc.markers1, 30)
pbmc.markers_sorted_top30 <- pbmc.markers_sorted
write.csv(pbmc.markers_sorted_top30, file = './cd8_final_markers.csv')

#sort markers (top 5)
pbmc.markers_sorted <- top_markers_per_cluster(pbmc.markers1, 5)
pbmc.markers_sorted_top5 <- pbmc.markers_sorted
pbmc.markers_sorted_top5

In [None]:
# Generate a dot plot with the top-5 markers per cluster
# (uses `pbmc.markers_sorted_top5`, which must already exist).
width <- 20
height <- 8
options(repr.plot.width = width, repr.plot.height = height)

# Work on a copy so that reversing the identity levels does not touch `pbmc`.
object <- pbmc
levels(object) <- rev(levels(object))

plot <- DotPlot(
    object,
    features = unique(pbmc.markers_sorted_top5$gene),
    dot.scale = 10,
    cols = c('white', '#D3556E')
) +
    RotatedAxis() +
    theme(
        text = element_text(size = 17),
        axis.text = element_text(size = 17),
        legend.text = element_text(size = 17)
    )
plot
# Full argument names instead of the partially matched `file =`.
ggsave(filename = './outs/dotplot_cd8.pdf', plot = plot, width = width, height = height)

In [None]:
# Naive signature enrichment UMAP
# The gene set is scored per cell with AddModuleScore() and the resulting
# 'naiive_signature1' meta.data column is shown on the UMAP. The plot title
# lists the genes of the signature.
# AddModuleScore() expects a list of gene vectors, so the set is stored as a
# one-element list.
naiive_markers <- list(
    c('CCR7', 'LEF1', 'MAL', 'MYC', 'SELL', 'LDLRAP1', 'LDHB', 'NOSIP', 'TCF7', 'PIK3IP1', 'NELL2')
)

width <- 11
height <- 11
name <- 'naiive_markers_umap'
options(repr.plot.width = width, repr.plot.height = height)
pbmc <- AddModuleScore(pbmc, features = naiive_markers, name = 'naiive_signature')

# TRUE is written out in full; `T` is an ordinary variable that can be reassigned.
plot <- FeaturePlot(
    pbmc,
    features = 'naiive_signature1',
    label.size = 6,
    label = TRUE,
    pt.size = 5,
    raster = TRUE,
    raster.dpi = c(2048, 2048),
    repel = TRUE,
    cols = c('lightgrey', '#D3556E'),
    min.cutoff = 'q75',
    order = TRUE
) +
    theme(
        text = element_text(size = 20),
        axis.text = element_text(size = 20),
        legend.text = element_text(size = 18),
        plot.title = element_text(size = 15, face = 'plain')
    ) +
    ggtitle(paste(naiive_markers[[1]], collapse = ', '))
plot
# Full argument names instead of the partially matched `file =`.
ggsave(filename = paste0(dir_plots, name, '.pdf'), plot = plot, width = width, height = height)

In [None]:
# Effector signature enrichment UMAP
# The gene set is scored per cell with AddModuleScore() and the resulting
# 'effectors_signature1' meta.data column is shown on the UMAP. The plot title
# lists the genes of the signature.
# AddModuleScore() expects a list of gene vectors, so the set is stored as a
# one-element list.
effectors_markers <- list(
    c('CX3CR1', 'GNLY', 'GZMH', 'FGFBP2', 'FCGR3A', 'PLEK', 'ADGRG1', 'PRF1')
)

width <- 11
height <- 11
name <- 'effector_markers_umap'
options(repr.plot.width = width, repr.plot.height = height)
pbmc <- AddModuleScore(pbmc, features = effectors_markers, name = 'effectors_signature')

# TRUE is written out in full; `T` is an ordinary variable that can be reassigned.
plot <- FeaturePlot(
    pbmc,
    features = 'effectors_signature1',
    label.size = 6,
    label = TRUE,
    pt.size = 5,
    raster = TRUE,
    raster.dpi = c(2048, 2048),
    repel = TRUE,
    cols = c('lightgrey', '#D3556E'),
    min.cutoff = 'q75',
    order = TRUE
) +
    theme(
        text = element_text(size = 20),
        axis.text = element_text(size = 20),
        legend.text = element_text(size = 18),
        plot.title = element_text(size = 15, face = 'plain')
    ) +
    ggtitle(paste(effectors_markers[[1]], collapse = ', '))
plot
# Full argument names instead of the partially matched `file =`.
ggsave(filename = paste0(dir_plots, name, '.pdf'), plot = plot, width = width, height = height)

In [None]:
# Activation signature enrichment UMAP
# The gene set is scored per cell with AddModuleScore() and the resulting
# 'hla_effectors_signature1' meta.data column is shown on the UMAP. The plot
# title lists the genes of the signature.

width <- 11
height <- 11
name <- 'activation_umap'
options(repr.plot.width = width, repr.plot.height = height)

# AddModuleScore() expects a list of gene vectors, so the set is stored as a
# one-element list.
hla_effectors_markers <- list(
    c('CD74', 'CMC1', 'GZMK', 'HLA-DRA', 'HLA-DQA1', 'HLA-DQB1', 'HLA-DRB5')
)
pbmc <- AddModuleScore(pbmc, features = hla_effectors_markers, name = 'hla_effectors_signature')

# TRUE is written out in full; `T` is an ordinary variable that can be reassigned.
plot <- FeaturePlot(
    pbmc,
    features = 'hla_effectors_signature1',
    label.size = 6,
    label = TRUE,
    pt.size = 5,
    raster = TRUE,
    raster.dpi = c(2048, 2048),
    repel = TRUE,
    cols = c('lightgrey', '#D3556E'),
    min.cutoff = 'q75',
    order = TRUE
) +
    theme(
        text = element_text(size = 20),
        axis.text = element_text(size = 20),
        legend.text = element_text(size = 18),
        plot.title = element_text(size = 15, face = 'plain')
    ) +
    ggtitle(paste(hla_effectors_markers[[1]], collapse = ', '))
plot
# Full argument names instead of the partially matched `file =`.
ggsave(filename = paste0(dir_plots, name, '.pdf'), plot = plot, width = width, height = height)

In [None]:
# Average the RNA assay per identity class and keep the result as a Seurat
# object (`object_av`). The identity levels are reversed on a copy first, so
# `pbmc` itself keeps its level order.
object <- pbmc
reversed_levels <- rev(levels(object))
levels(object) <- reversed_levels
object_av <- AverageExpression(
    object,
    assay = "RNA",
    return.seurat = TRUE
) # , verbose = FALSE)

In [None]:
# Print the signatures as a heatmap of the per-cluster averaged expression.

width <- 9
height <- 22

markers <- c('CCR7', 'LEF1', 'MAL', 'MYC', 'SELL', 'LDLRAP1', 'LDHB', 'NOSIP', 'TCF7', 'PIK3IP1', 'NELL2', 'CD74', 'CMC1', 'GZMK', 'HLA-DRA', 'HLA-DQA1', 'HLA-DQB1', 'HLA-DRB5',
            'CX3CR1', 'GNLY', 'GZMH', 'FGFBP2', 'FCGR3A', 'PLEK', 'ADGRG1', 'PRF1')

# Limits for the heatmap colour scale: range of the scaled averages of the
# selected markers in `object_av`.
data_markers <- object_av@assays$RNA@scale.data
data_markers <- data_markers[markers, ]
max.value <- max(data_markers)
min.value <- min(data_markers)

options(repr.plot.width = width, repr.plot.height = height)
# The heatmap is drawn from `object_av`, the same object the colour limits are
# computed from. The original call plotted `cluster.averages_pbmc`, which is
# not created anywhere in this section, so limits and data could disagree.
# NOTE(review): if `cluster.averages_pbmc` is defined elsewhere on purpose,
# confirm that it is the same averaged object.
hm <- DoHeatmap(
    object_av,
    features = markers,
    draw.lines = FALSE,
    size = 8,
    angle = 270,
    hjust = 1,
    raster = FALSE
) +
    theme(
        text = element_text(size = 20, face = "plain", colour = 'black'),
        axis.text.y = element_text(colour = "black", size = 18, angle = 320, vjust = 1)
    ) +
    coord_equal() +
    # Colour stops at min, 0, max / 2 and max, rescaled to [0, 1].
    scale_fill_gradientn(
        colours = c("#2881C1", "white", "#D3556E", "#671727"),
        values = scales::rescale(c(min.value, 0, max.value / 2, max.value))
    )
# Full argument names instead of the partially matched `file =`.
ggsave(filename = './outs/heatmap_signatures.pdf', plot = hm, width = width, height = height)
hm

# Expansion overview

In [None]:
# Flag every cell as 'expanded' when its TCR_frequency_corrected is above 2;
# all other cells (including those without a value) stay 'non-expanded'.
expanded_rows <- which(pbmc@meta.data$TCR_frequency_corrected > 2)
pbmc@meta.data$expand_new <- 'non-expanded'
pbmc@meta.data$expand_new[expanded_rows] <- 'expanded'
unique(pbmc@meta.data$expand_new)

In [None]:
# Print a UMAP with the cells grouped by `expand_new`
# (colours '#D3556E' and 'lightgrey' are supplied for the two groups).
width <- 12
height <- 11
options(repr.plot.width = width, repr.plot.height = height)
expanded_cells <- DimPlot(
    pbmc,
    reduction = "umap",
    label = TRUE,
    label.size = 8,
    repel = TRUE,
    group.by = 'expand_new',
    cols = c('#D3556E', 'lightgrey'),
    pt.size = 0.6
) +
    theme(
        text = element_text(size = 20),
        axis.text = element_text(size = 20),
        legend.text = element_text(size = 20)
    )
expanded_cells
# Full argument names instead of the partially matched `file =`.
ggsave(filename = paste0(dir_plots, 'expanded_cells_umap.pdf'), plot = expanded_cells, width = width, height = height)

In [None]:
# Generate a stacked bar plot with the fraction of expanded / non-expanded
# cells in every cluster (`cd8_coded`) and export the underlying table.
expansion <- c('non-expanded', 'expanded')
object <- pbmc
Idents(object) <- 'expand_new'
levels(object) <- expansion
# Cluster order on the x axis follows the identity levels of `pbmc`.
levels_subgroups <- levels(pbmc)
object@meta.data$cluster_name <- object@meta.data$cd8_coded
object@meta.data$clusters <- object@meta.data$cluster_name
width <- 14
height <- 11
subgroups <- unique(object@meta.data$cluster_name)

# One row per (cluster, expansion status) with absolute and relative counts.
# The original built a scratch data frame with one column per cluster level
# but named only four of them, which left NA-named junk columns in the
# exported CSV; the table is now built with exactly the four needed columns.
cell_md <- object@meta.data
results <- do.call(rbind, lapply(seq_along(subgroups), function(i) {
    in_cluster <- cell_md$cluster_name == subgroups[i]
    absolute <- vapply(expansion, function(status) {
        sum(in_cluster & cell_md$expand_new == status, na.rm = TRUE)
    }, integer(1), USE.NAMES = FALSE)
    data.frame(
        expand = expansion,
        absolute = absolute,
        relative = 100 * absolute / sum(in_cluster, na.rm = TRUE),
        subgroups = as.character(subgroups[i]),
        stringsAsFactors = FALSE
    )
}))
results$subgroups <- factor(results$subgroups, levels = levels_subgroups)
results$clusters <- factor(results$expand, levels = expansion)
write.csv(results, file = './outs/expanded_per_cluster_frequencies.csv')


options(repr.plot.width = width, repr.plot.height = height)
# Changes to the plot call: the stray trailing comma in geom_bar() is removed
# and the x axis, which shows clusters, is no longer labelled 'Samples'.
plot <- ggplot(results, aes(fill = clusters, y = relative, x = subgroups)) +
    geom_bar(position = "stack", stat = "identity") +
    theme(
        plot.title = element_text(hjust = 0.45),
        text = element_text(size = 30),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        axis.text.x = element_text(angle = 55, vjust = 1, hjust = 1, colour = 'black')
    ) +
    ylab('Fraction of expanded cells within the cluster') +
    xlab('Cluster') +
    scale_fill_manual('legend', values = c('lightgrey', '#D3556E'))
    #scale_fill_viridis(discrete = TRUE)
    #scale_fill_brewer(palette = "Paired")
print(plot)
# Full argument names instead of the partially matched `file =`.
ggsave(filename = paste0(dir_plots, 'expanded_bar_plot.pdf'), plot = plot, width = width, height = height)

In [None]:
# Numbers of expanded and non-expanded cells per cluster (cd8_coded).
# Result: one column per cluster, rows = absolute / relative (percent) counts
# of expanded and non-expanded cells plus the cluster total.
cell_md <- pbmc@meta.data
cluster_ids <- unique(cell_md$cd8_coded)

expvsnonexp <- data.frame(matrix(NA, ncol = length(cluster_ids), nrow = 5))
colnames(expvsnonexp) <- as.character(cluster_ids)
rownames(expvsnonexp) <- c('expanded abs', 'exp relative', 'non-expanded abs', 'non-exp relative', 'total')

# seq_along() keeps the loop from running when there are no clusters, the
# cluster membership is computed once per cluster instead of once per
# statistic, and no local variable shadows base::exp() any more.
for (i in seq_along(cluster_ids)) {
    in_cluster <- cell_md$cd8_coded == cluster_ids[i]
    n_total <- sum(in_cluster, na.rm = TRUE)
    n_expanded <- sum(in_cluster & cell_md$expand_new == 'expanded', na.rm = TRUE)
    n_non_expanded <- sum(in_cluster & cell_md$expand_new == 'non-expanded', na.rm = TRUE)
    expvsnonexp[, i] <- c(
        n_expanded,
        100 * n_expanded / n_total,
        n_non_expanded,
        100 * n_non_expanded / n_total,
        n_total
    )
}

# Order the clusters by increasing relative expansion. The row is turned into
# a plain numeric vector first: calling order() on a one-row data frame is not
# supported usage and newer R releases complain about it. drop = FALSE keeps a
# data frame even when there is only one cluster.
expansion_order <- order(unlist(expvsnonexp[2, ]))
expvsnonexp1 <- expvsnonexp[, expansion_order, drop = FALSE]
expvsnonexp1
write.csv(expvsnonexp1, file = './expansion/cd8_expansion_coded.csv')

# Cell cycle scoring

In [None]:
# Score every cell for cell-cycle phase with the S and G2/M gene lists taken
# from `cc.genes`. `set.ident = TRUE` is passed, so the call may replace the
# active identities of `pbmc` -- NOTE(review): confirm this is intended.
s.genes <- cc.genes[['s.genes']]
g2m.genes <- cc.genes[['g2m.genes']]
pbmc <- CellCycleScoring(
    object = pbmc,
    s.features = s.genes,
    g2m.features = g2m.genes,
    set.ident = TRUE
)
head(pbmc@meta.data)

In [None]:
# UMAP with the cells grouped by the 'Phase' meta.data column
# (presumably the column written by the cell-cycle scoring step -- confirm).
DimPlot(pbmc, reduction = 'umap', group.by = 'Phase', label = TRUE, pt.size = 1.2)

# Adding diagnosis, treatment, date and CSF sample status information into the objects:

In [None]:
# Per-sample annotation table; the first CSV column is used as row names.
# Later cells read its Sample, Diagnosis, Date, Treatment and CSF_st columns.
info <- read.csv(file = './Sample_info.csv', row.names = 1)

In [None]:
# For the pbmc object: copy the per-sample annotations from `info` onto every
# cell, matching the cell's `sample` against `info$Sample`.
# A single match() lookup replaces the original per-cell loop, which ran four
# dplyr::filter() calls for every row of the meta.data.
sample_row <- match(pbmc@meta.data$sample, info$Sample)

# The original loop stopped with an error when a sample was missing from
# `info`; keep that strictness instead of silently filling in NA.
if (anyNA(sample_row)) {
    stop(
        'Samples missing from the sample info table: ',
        paste(unique(pbmc@meta.data$sample[is.na(sample_row)]), collapse = ', ')
    )
}

pbmc@meta.data$diagnosis <- info$Diagnosis[sample_row]
pbmc@meta.data$date <- info$Date[sample_row]
pbmc@meta.data$treatment <- info$Treatment[sample_row]
# `$CSF_st` is kept exactly as in the original: `$` on a data frame also
# resolves a longer column name that starts with these characters.
pbmc@meta.data$csf_status <- info$CSF_st[sample_row]

In [None]:
# Create the diagnosis_simp column: a copy of `diagnosis` in which 'RRMS' and
# 'SPMS' are both collapsed to 'MS'.
# Vectorised with %in%, which treats a missing diagnosis as "no match"; the
# original row-by-row `if` stopped with an error on the first NA.
pbmc@meta.data$diagnosis_simp <- pbmc@meta.data$diagnosis
is_ms <- pbmc@meta.data$diagnosis %in% c('RRMS', 'SPMS')
pbmc@meta.data$diagnosis_simp[is_ms] <- 'MS'
unique(pbmc@meta.data$diagnosis_simp)

In [None]:
# Checkpoint: write the annotated Seurat object and, separately, its
# meta.data table to the working directory.
saveRDS(pbmc, file = 'cd8_20210815.rds')
write.csv(pbmc@meta.data, file = 'md_cd8_20210815.csv')

# Searching for the matching clonotypes between PBMCs and CSF

In [None]:
# Read in the table with the TCR data and meta.data from the CSF of the twins
# cohort.

md <- read.table('./CSF_information.tsv')
# Identify all factor columns and convert them to character. vapply() always
# returns a logical vector (also for a table without columns), whereas
# sapply() can return a list that is not a valid column index.
i <- vapply(md, is.factor, logical(1))
md[i] <- lapply(md[i], as.character)

In [None]:
# Keep only the CSF rows whose `index.sort` is 'CD8Tcell'.
md <- filter(md, index.sort == 'CD8Tcell')

In [None]:
# Prepare the meta.data of pbmc for the overlap analysis: `samplenumb` is the
# sample name with the '-2' samples mapped onto their base sample
# ('AU-MS-2' -> 'AU-MS', 'AU-H-2' -> 'AU-H').
# Vectorised with %in%, so a missing sample name is left untouched instead of
# stopping the row-by-row `if` of the original with an error.
pbmc_md <- pbmc@meta.data
pbmc_md$samplenumb <- pbmc_md$sample
pbmc_md$samplenumb[pbmc_md$sample %in% 'AU-MS-2'] <- 'AU-MS'
pbmc_md$samplenumb[pbmc_md$sample %in% 'AU-H-2'] <- 'AU-H'

In [None]:
# Restrict the PBMC meta.data to cells whose csf_status is 'yes', then list
# the sample identifiers that remain.
has_csf <- pbmc_md$csf_status == 'yes'
subset_pbmc <- pbmc_md[has_csf, ]
unique(subset_pbmc$samplenumb)
unique(subset_pbmc$sample)

In [None]:
# Mode helper used to pick the most frequent CSF cluster / clonotype.
# Returns the most frequent value of `v`; when several values are tied, the
# one that appears first in `v` is returned.
getmode <- function(v) {
    distinct_values <- unique(v)
    occurrences <- tabulate(match(v, distinct_values))
    distinct_values[which.max(occurrences)]
}
# CSF meta.data = md, PBMC meta.data = subset_pbmc. Search for overlaps:
# a PBMC cell overlaps when one of its TCR1B..TCR4B sequences occurs among the
# CDR3b sequences of the CSF cells of the same sample (`samplenumb`).
# For every hit the cell barcode is recorded together with the most frequent
# `tSNE.sort` and `TCR_Clono` of the matching CSF cells. A cell can be
# recorded more than once (one entry per matching chain).
md$barcodes <- rownames(md)
subset_pbmc$barcodes <- rownames(subset_pbmc)
md$TCR_Clono <- as.character(md$TCR_Clono)
overlaped_cells_pbmc <- c()
overlaped_pbmc_cluster_corresponding <- c()
overlaped_pbmc_TCR_clono_corresponding <- c()

tcrb_columns <- paste0('TCR', 1:4, 'B')
samples_with_csf <- unique(subset_pbmc$samplenumb)

# seq_along()/seq_len() keep the loops from running on empty input; with
# `1:nrow(x)` a sample without cells would be indexed at rows 1 and 0.
for (i in seq_along(samples_with_csf)) {
    # CSF and PBMC meta.data of the sample handled in this iteration
    sample_csf <- filter(md, samplenumb == samples_with_csf[i])
    sample_pbmc <- filter(subset_pbmc, samplenumb == samples_with_csf[i])
    for (r in seq_len(nrow(sample_pbmc))) {
        for (tcrb_column in tcrb_columns) {
            cdr3b <- as.character(sample_pbmc[r, tcrb_column])
            # 'FALSE' marks an absent chain. A missing value is skipped too;
            # the original condition could evaluate to NA and abort the loop.
            if (is.na(cdr3b) || cdr3b == 'FALSE' || !(cdr3b %in% sample_csf$CDR3b)) {
                next
            }
            csf_hits <- which(sample_csf$CDR3b == cdr3b)
            overlaped_cells_pbmc <- c(overlaped_cells_pbmc, sample_pbmc$barcodes[r])
            overlaped_pbmc_cluster_corresponding <- c(
                overlaped_pbmc_cluster_corresponding,
                getmode(sample_csf$tSNE.sort[csf_hits])
            )
            overlaped_pbmc_TCR_clono_corresponding <- c(
                overlaped_pbmc_TCR_clono_corresponding,
                getmode(sample_csf$TCR_Clono[csf_hits])
            )
        }
    }
}


In [None]:
# Collect the overlap hits in one table: PBMC barcode plus the most frequent
# CSF cluster and CSF clonotype found for it.
# The table is built directly from the three result vectors. When the search
# found nothing, those vectors are NULL; they are replaced by empty character
# vectors so that an empty three-column table is produced (assigning NULL to
# the columns of a placeholder table, as the original did, deletes them).
empty_if_null <- function(x) if (is.null(x)) character(0) else x

overlap_matrix_pbmc <- data.frame(
    barcodes = empty_if_null(overlaped_cells_pbmc),
    csf_cluster = empty_if_null(overlaped_pbmc_cluster_corresponding),
    csf_tcr_clono = empty_if_null(overlaped_pbmc_TCR_clono_corresponding),
    stringsAsFactors = FALSE
)

# A cell is listed once per matching chain; keep its first entry only.
overlap_matrix_pbmc <- overlap_matrix_pbmc[!duplicated(overlap_matrix_pbmc$barcodes), ]
head(overlap_matrix_pbmc)
nrow(overlap_matrix_pbmc)

In [None]:
# Subset the Seurat object to the cells whose csf_status is 'yes'.
# `samplenumb` is copied over from `pbmc_md` first. Note that the active
# identities of `pbmc` are switched to 'csf_status' by this cell.
pbmc@meta.data[['samplenumb']] <- pbmc_md$samplenumb
Idents(pbmc) <- 'csf_status'
subset_pbmc_object <- subset(
    pbmc,
    idents = 'yes'
)
subset_pbmc_object
unique(subset_pbmc_object@meta.data[['samplenumb']])

In [None]:
# Add the overlap information to the subset of the pbmc object.
# Cells without a hit keep the placeholder string 'FALSE' in all three columns.
subset_pbmc_object@meta.data$overlap <- 'FALSE'
subset_pbmc_object@meta.data$csf_cluster <- 'FALSE'
subset_pbmc_object@meta.data$csf_tcr_clono <- 'FALSE'

# One match() lookup replaces the original per-cell loop, which ran two
# dplyr::filter() calls over the overlap table for every overlapping cell.
# `overlap_matrix_pbmc` holds each barcode once, so the first match is the
# only match.
overlap_row <- match(rownames(subset_pbmc_object@meta.data), overlap_matrix_pbmc$barcodes)
is_overlap <- !is.na(overlap_row)

subset_pbmc_object@meta.data$overlap[is_overlap] <- 'overlap'
subset_pbmc_object@meta.data$csf_cluster[is_overlap] <-
    overlap_matrix_pbmc$csf_cluster[overlap_row[is_overlap]]
subset_pbmc_object@meta.data$csf_tcr_clono[is_overlap] <-
    overlap_matrix_pbmc$csf_tcr_clono[overlap_row[is_overlap]]
head(subset_pbmc_object@meta.data)

In [None]:
# Row names (cell barcodes) of all cells flagged as 'overlap'.
overlapedcells <- rownames(filter(subset_pbmc_object@meta.data, overlap == 'overlap'))

In [None]:
# Use the cd8_coded clusters as identities and put them in a fixed display order.
cluster_order <- c(
    'NK-like', 'MAIT',
    '1_CCR7', '2_NELL2', '3_NT5E', '4_CD82', '5_MAL', '6_GZMK',
    '7_MX1', '8_CD74', '9_IKZF2', '10_FGFBP2'
)
Idents(subset_pbmc_object) <- 'cd8_coded'
levels(subset_pbmc_object) <- cluster_order

In [None]:
# Two UMAP views of the overlapping cells: first only those cells, then the
# whole subset with the overlapping cells highlighted (this one is saved).
options(repr.plot.width = 15, repr.plot.height = 12)
DimPlot(
    subset_pbmc_object,
    reduction = "umap",
    label = TRUE,
    label.size = 9,
    repel = TRUE,
    cells = overlapedcells,
    pt.size = 1.5
) +
    theme(
        axis.line = element_line(size = 1),
        text = element_text(size = 20),
        axis.text = element_text(size = 20),
        axis.ticks = element_line(size = 1),
        legend.text = element_text(size = 20)
    )

# TRUE is written out in full; `T` is an ordinary variable that can be reassigned.
plot <- DimPlot(
    subset_pbmc_object,
    reduction = "umap",
    label = TRUE,
    label.size = 9,
    pt.size = 7,
    raster = TRUE,
    raster.dpi = c(2500, 2500),
    repel = TRUE,
    cells.highlight = overlapedcells,
    sizes.highlight = 16,
    cols.highlight = '#D3556E'
) +
    theme(
        text = element_text(size = 20),
        axis.text = element_text(size = 20),
        legend.text = element_text(size = 20)
    )
plot
# Full argument names instead of the partially matched `file =`.
ggsave(filename = paste0(dir_plots, 'overlap_cells_umap.pdf'), plot = plot, width = 15, height = 12)
# Restore the notebook's default figure size.
options(repr.plot.width = 11, repr.plot.height = 11)

In [None]:
# Stacked bar plot with the fraction of CSF-overlapping / non-overlapping
# cells in every cluster (`cd8_coded`), plus export of the underlying table.
object <- subset_pbmc_object
object@meta.data$overlap[object@meta.data$overlap == 'FALSE'] <- 'non-overlapping'
object@meta.data$overlap[object@meta.data$overlap == 'overlap'] <- 'overlapping'

expansion <- c('non-overlapping', 'overlapping')
Idents(object) <- 'overlap'
levels(object) <- expansion
# Cluster order on the x axis follows the identity levels of the subset object.
levels_subgroups <- levels(subset_pbmc_object)
object@meta.data$cluster_name <- object@meta.data$cd8_coded
object@meta.data$clusters <- object@meta.data$cluster_name

width <- 14
height <- 11
subgroups <- unique(object@meta.data$cluster_name)

# One row per (cluster, overlap status) with absolute and relative counts.
# The original built a scratch data frame with one column per cluster level
# but named only four of them, which left NA-named junk columns in the
# exported CSV; the table is now built with exactly the four needed columns.
cell_md <- object@meta.data
results <- do.call(rbind, lapply(seq_along(subgroups), function(i) {
    in_cluster <- cell_md$cluster_name == subgroups[i]
    absolute <- vapply(expansion, function(status) {
        sum(in_cluster & cell_md$overlap == status, na.rm = TRUE)
    }, integer(1), USE.NAMES = FALSE)
    data.frame(
        expand = expansion,
        absolute = absolute,
        relative = 100 * absolute / sum(in_cluster, na.rm = TRUE),
        subgroups = as.character(subgroups[i]),
        stringsAsFactors = FALSE
    )
}))
results$subgroups <- factor(results$subgroups, levels = levels_subgroups)
results$clusters <- factor(results$expand, levels = expansion)
write.csv(results, file = './outs/overlap_per_cluster_frequencies.csv')

# Display-only adjustment, applied after the CSV export: 75 percentage points
# are removed from every non-overlapping bar so that each stack sums to 25 and
# fits the y limits set below. From here on `results$relative` no longer holds
# the true non-overlapping percentages.
is_non_overlapping <- results$expand == 'non-overlapping'
results$relative[is_non_overlapping] <- results$relative[is_non_overlapping] - 75

options(repr.plot.width = width, repr.plot.height = height)
# Changes to the plot call: the stray trailing comma in geom_bar() is removed
# and the axis titles, copied from the expansion plot, now describe what is
# shown (overlapping cells per cluster).
plot <- ggplot(results, aes(fill = clusters, y = relative, x = subgroups)) +
    geom_bar(position = "stack", stat = "identity") +
    theme(
        plot.title = element_text(hjust = 0.45),
        text = element_text(size = 30),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"),
        axis.text.x = element_text(angle = 55, vjust = 1, hjust = 1, colour = 'black')
    ) +
    ylab('Fraction of overlapping cells within the cluster') +
    xlab('Cluster') +
    scale_fill_manual('legend', values = c('lightgrey', '#D3556E')) +
    ylim(0, 25.01)
    #scale_fill_viridis(discrete = TRUE)
    #scale_fill_brewer(palette = "Paired")
print(plot)
# Full argument names instead of the partially matched `file =`.
ggsave(filename = './outs/overlap_bar_plot.pdf', plot = plot, width = width, height = height)

In [None]:
# Export the meta.data of the CSF-matched subset, including the overlap columns.
write.csv(subset_pbmc_object@meta.data, file = './overlaped_cells_md_20210815.csv')

In [None]:
# Save the CSF-matched subset object for later sessions.
saveRDS(subset_pbmc_object, file = './samples_with_csfoverlap_20210815.rds')