# Setup

In [None]:
# Attach a package while suppressing its package-startup messages.
# All arguments in `...` are forwarded unchanged to library().
quiet_library <- function(...) {
    suppressPackageStartupMessages(library(...))
}
quiet_library(Seurat)
quiet_library(tidyverse)
quiet_library(ggplot2)
quiet_library(Matrix)
quiet_library(H5weaver)
quiet_library(dplyr)
quiet_library(viridis)
quiet_library(harmony)
quiet_library(Nebulosa)
quiet_library(future)
quiet_library(future.apply)
quiet_library(dittoSeq)
quiet_library('ArchR')

In [None]:
# Check number of cores
future::availableCores()
# Set up parallel processing to run when using 'future' functions.
# Three cores are left free, but at least one worker is always requested:
# on a machine with three or fewer cores `availableCores() - 3` would be <= 0,
# which is not a valid worker count.
future::plan(strategy = "multicore",
             workers = max(1L, future::availableCores() - 3L))
# Raise the size limit for globals exported to workers to a very large value
options(future.globals.maxSize = 1000 * 1024^5)
# to turn off parallel processing run line below
# future::plan(strategy = "sequential")

In [None]:
# Input / output locations used throughout the notebook
fig_path <- '/home/jupyter/figures/preRA_teaseq'
data_path <- '/home/jupyter/data/preRA_teaseq/EXP-00243'
meta_path <- '/home/jupyter/data/preRA_teaseq/meta_data'
output_path <- '/home/jupyter/data/preRA_teaseq/output_results'
# Create the figure and result folders on first use
if (!dir.exists(fig_path)) {
  (dir.create(fig_path, recursive = TRUE))
}
if (!dir.exists(output_path)) {
  (dir.create(output_path, recursive = TRUE))
}
# Prefix used for every exported file name
proj_name <- 'PreRA_teaseq'

In [None]:
# define the color palettes to be used
# Four short named palettes (hex colours with an alpha channel); nejm_color is the
# one passed to `cols =` / scale_fill_manual() for cohort colouring below.
npg_color <- c("#E64B35FF", "#4DBBD5FF", "#00A087FF", "#3C5488FF", "#F39B7FFF", 
               "#8491B4FF", "#91D1C2FF", "#DC0000FF", "#7E6148FF", "#B09C85FF")
nejm_color <- c("#BC3C29FF", "#0072B5FF", "#E18727FF", "#20854EFF", "#7876B1FF", "#6F99ADFF", "#FFDC91FF", "#EE4C97FF")
jama_color <- c("#374E55FF", "#DF8F44FF", "#00A1D5FF", "#B24745FF", "#79AF97FF", "#6A6599FF", "#80796BFF")
jco_color <- c("#0073C2FF", "#EFC000FF", "#868686FF", "#CD534CFF", "#7AA6DCFF", "#003C67FF", "#8F7700FF")
# 30 distinct colours used for cluster / cell-type labels
cluster_colors <- c("#DC050C", "#FB8072", "#1965B0", "#7BAFDE", "#882E72", "#B17BA6", "#FF7F00", "#FDB462", "#E7298A", 
    "#E78AC3", "#33A02C", "#B2DF8A", "#55A1B1", "#8DD3C7", "#A6761D", "#E6AB02", "#7570B3", "#BEAED4", "#666666", "#999999", 
    "#aa8282", "#d4b7b7", "#8600bf", "#ba5ce3", "#808000", "#aeae5c", "#1e90ff", "#00bfff", "#56ff0d", "#ffff00")

# Interpolate the 30 cluster colours into a 36-colour ramp
cluster_colors_ext <- colorRampPalette(cluster_colors)(36)
# Default size of inline notebook plots
options(repr.plot.width = 20, repr.plot.height = 15)


In [None]:
# Load project helper functions from a local checkout.
# NOTE(review): plot_cluster_freq(), plot_cluster_counts() and ExtractLSI() are called
# later but are not defined in this notebook; presumably they come from this file - confirm.
source('/home/jupyter/github/Teaseq-analysis/scRNA_teaseq_ananlysis_helper_functions.r')

## Load data

In [None]:
# load the QC-filtered Seurat object
all_so <- readRDS(
  file.path(data_path, 'PreRA_teaseq_seurat_qc_filtered_cells_lsi.rds')
)

In [None]:
# object summary and the available metadata columns
all_so
colnames(all_so@meta.data)

In [None]:
# number of cells per cleaned l2 cell type
all_so@meta.data$clean_l2_cell_types %>%
  table()

In [None]:
# export the total cell counts per sample
# tally(name = ...) names the count column directly. The previous
# rename('n' = 'total_pbmc_counts') only works when S4Vectors::rename (old = new)
# masks dplyr::rename (new = old); with dplyr's rename it fails because no
# column called 'total_pbmc_counts' exists yet.
total_counts <- all_so@meta.data %>%
    group_by(pbmc_sample_id, subject_id) %>%
    tally(name = 'total_pbmc_counts')
total_counts
# total_counts %>% write_csv(file.path(output_path, 
#                                      paste0(proj_name, 
#                                             'PreRA_teaseq_seurat_qc_filtered_total_cell_counts.csv')))

In [None]:
# QC metrics per well, with the median marked as a black point
options(repr.plot.width = 20, repr.plot.height = 15)
VlnPlot(
  all_so,
  features = c('nFeature_RNA', 'nCount_RNA', 'nCount_ADT', 'percent.mt'),
  log = FALSE, pt.size = 0, group.by = 'well_id', ncol = 2
) &
  stat_summary(fun = median, geom = "point", color = "black")
ggsave(file.path(fig_path, paste0(proj_name, '_filtered_cells_qc_plots.pdf')))

In [None]:
# list the wells present in the object
distinct(all_so@meta.data, well_id)

In [None]:
# existing RNA UMAP coloured by cluster, predicted l2 label, well and subject
options(repr.plot.width = 20, repr.plot.height = 15)
p1 <- DimPlot(
  all_so, group.by = 'SCT_snn_res.0.5', label = TRUE,
  raster = TRUE, shuffle = TRUE, reduction = 'umap'
)
p2 <- DimPlot(
  all_so, group.by = 'predicted.celltype.l2',
  raster = TRUE, shuffle = TRUE, label = TRUE, reduction = 'umap'
)
p3 <- DimPlot(
  all_so, group.by = 'well_id',
  raster = TRUE, shuffle = TRUE, label = TRUE, reduction = 'umap'
)
p4 <- DimPlot(
  all_so, group.by = 'subject_id',
  raster = TRUE, shuffle = TRUE, label = TRUE, reduction = 'umap'
)
p1 + p2 + p3 + p4
ggsave(
  file.path(fig_path, paste0(proj_name, '_filtered_cells_l2_seurat_label_rnaumap.pdf')),
  width = 12, height = 8
)

In [None]:
# write the number of cells per subject to csv
all_so@meta.data %>%
  group_by(subject_id) %>%
  tally() %>%
  write_csv(
    file.path(output_path, paste0(proj_name, '_qc_filered_subject_cell_counts.csv'))
  )

In [None]:
# predicted l2 labels present in the data
distinct(all_so@meta.data, predicted.celltype.l2)

In [None]:
all_so

## Recluster on RNA, ADT, ATAC

In [None]:
# metadata columns available before reclustering
colnames(all_so@meta.data)

### reclustering on RNA

In [None]:
# re-run SCTransform regressing out percent.mt, then PCA
DefaultAssay(all_so) <- "RNA"
all_so <- suppressWarnings(
  SCTransform(all_so, vars.to.regress = 'percent.mt')
) %>%
  RunPCA()

In [None]:
# elbow plot of the first 50 PCs
ElbowPlot(all_so, ndims = 50, reduction = 'pca')

In [None]:
# UMAP, neighbour graph and clustering on the first 30 PCs
all_so <- RunUMAP(all_so, dims = 1:30, verbose = TRUE) %>%
  FindNeighbors(dims = 1:30, verbose = TRUE) %>%
  FindClusters(resolution = 0.5, verbose = TRUE, future.seed = TRUE)

In [None]:
# RNA UMAP coloured by cluster, predicted l2 label, subject and cohort
options(repr.plot.width = 20, repr.plot.height = 10)
p1 <- DimPlot(
  all_so, label = TRUE, reduction = 'umap',
  raster = TRUE, shuffle = TRUE, group.by = 'SCT_snn_res.0.5'
)

p2 <- DimPlot(
  all_so, group.by = 'predicted.celltype.l2', cols = cluster_colors,
  raster = TRUE, shuffle = TRUE, reduction = 'umap', label = TRUE, repel = TRUE
) +
  NoLegend()
p3 <- DimPlot(
  all_so, group.by = 'subject_id',
  raster = TRUE, shuffle = TRUE, reduction = 'umap'
)
p4 <- DimPlot(
  all_so, group.by = 'cohort',
  raster = TRUE, shuffle = TRUE, reduction = 'umap'
)
p1 + p2 + p3 + p4
ggsave(
  file.path(fig_path, paste0(proj_name, '_rna_umap.pdf')),
  width = 12, height = 8
)

### reclustering on cleanadt

In [None]:
# create an assay for clean adt removing isotype controls
adt_mtx <- all_so@assays$ADT@counts
length(rownames(adt_mtx))
adts_to_remove <- rownames(adt_mtx) %>% str_subset('isotype|Isotype')
all(adts_to_remove %in% rownames(adt_mtx))

idx <- which(rownames(adt_mtx) %in% adts_to_remove)
# Select the rows to keep by positive index. adt_mtx[-idx, ] would return a
# zero-row matrix when `idx` is empty (no isotype control matched), because
# -integer(0) selects nothing. drop = FALSE keeps the result a matrix.
keep_idx <- setdiff(seq_len(nrow(adt_mtx)), idx)
clean_adt_mtx <- adt_mtx[keep_idx, , drop = FALSE]
length(rownames(clean_adt_mtx))
# add cleanadt to the seurat object
all_so[['cleanadt']] <- CreateAssayObject(clean_adt_mtx)
DefaultAssay(all_so) <- 'cleanadt'
rownames(all_so) %>% length()

In [None]:
# ADT-only analysis: use every cleanadt feature, CLR-normalise, scale, PCA
DefaultAssay(all_so) <- 'cleanadt'
VariableFeatures(all_so) <- rownames(all_so[['cleanadt']])
all_so <- NormalizeData(all_so, normalization.method = 'CLR', margin = 2) %>%
  ScaleData() %>%
  RunPCA(reduction.name = 'apca')
ElbowPlot(all_so, ndims = 50, reduction = 'apca')

# all_so <- NormalizeData(all_so, normalization.method = "CLR", margin = 2)
# all_so <- adt_clustering_func(all_so, assay='cleanadt', resolution = 0.8)

In [None]:
# UMAP and clustering on the first 25 ADT PCs
all_so <- RunUMAP(
  all_so, dims = 1:25, reduction = 'apca',
  reduction.name = 'adt_umap', reduction.key = 'adtumap_'
) %>%
  FindNeighbors(dims = 1:25, reduction = 'apca') %>%
  FindClusters(resolution = 0.5, future.seed = TRUE)

In [None]:
# ADT UMAP coloured by cluster, predicted l2 label, subject and cohort
options(repr.plot.width = 20, repr.plot.height = 15)
p1 <- DimPlot(
  all_so, label = TRUE, reduction = 'adt_umap',
  group.by = 'cleanadt_snn_res.0.5', raster = TRUE, shuffle = TRUE
) +
  NoLegend()
p2 <- DimPlot(
  all_so, group.by = 'predicted.celltype.l2', cols = cluster_colors,
  raster = TRUE, shuffle = TRUE, reduction = 'adt_umap',
  label = TRUE, repel = TRUE
)
p3 <- DimPlot(
  all_so, raster = TRUE, shuffle = TRUE,
  group.by = 'subject_id', reduction = 'adt_umap'
)
p4 <- DimPlot(
  all_so, raster = TRUE, shuffle = TRUE,
  group.by = 'cohort', reduction = 'adt_umap'
)
p1 + p2 + p3 + p4
ggsave(
  file.path(fig_path, paste0(proj_name, '_all_cells_cleanadt_umap.pdf')),
  width = 12, height = 8
)


### clustering on ATAC

In [None]:
# load the ArchR project holding the ATAC arrow files
tea_atac <- loadArchRProject(
  path = "/home/jupyter/data/preRA_teaseq/EXP-00243/atac_arrows"
)
# # rerun lsi in the subset cells and load lsi to seurat
# all_so <- ExtractLSI(all_so, tea_atac)

In [None]:
# UMAP and clustering on the 'lsit' reduction (dims 1:30)
DefaultAssay(all_so) <- 'Tiles'
all_so <- RunUMAP(
  all_so, dims = 1:30, reduction = 'lsit',
  reduction.name = 'atac_umap', reduction.key = 'atacumap_'
) %>%
  FindNeighbors(dims = 1:30, reduction = 'lsit') %>%
  FindClusters(resolution = 0.8, future.seed = TRUE)

In [None]:
colnames(all_so@meta.data)
all_so

In [None]:
# ATAC UMAP coloured by cluster, predicted l2 label, subject and cohort
options(repr.plot.width = 20, repr.plot.height = 15)
p1 <- DimPlot(
  all_so, label = TRUE, reduction = 'atac_umap',
  raster = TRUE, shuffle = TRUE, group.by = 'Tiles_snn_res.0.8'
) +
  NoLegend()
p2 <- DimPlot(
  all_so, group.by = 'predicted.celltype.l2', cols = cluster_colors,
  raster = TRUE, shuffle = TRUE, reduction = 'atac_umap',
  label = TRUE, repel = TRUE
)
p3 <- DimPlot(
  all_so, group.by = 'subject_id',
  raster = TRUE, shuffle = TRUE, reduction = 'atac_umap'
)
p4 <- DimPlot(
  all_so, group.by = 'cohort', raster = TRUE, shuffle = TRUE,
  cols = nejm_color, reduction = 'atac_umap'
)
p1 + p2 + p3 + p4
ggsave(
  file.path(fig_path, paste0(proj_name, '_all_cells_atac_umap.pdf')),
  width = 12, height = 8
)

In [None]:
# ATAC cluster frequencies coloured by cohort (helper from the sourced file)
plot_cluster_freq(all_so, cluster.name = 'Tiles_snn_res.0.8', color.by = 'cohort')
# plot_cluster_freq(all_so, cluster.name = 'Tiles_snn_res.0.8', color.by = 'cohort', figname='subtypeA')
# plot_cluster_counts(all_so, cluster.name = 'predicted.MonocyteSubsets', color.by = 'cohort', figname='subtypeA')

In [None]:
sort(colnames(all_so@meta.data))

### 3way wnn clustering

In [None]:
# Three-way weighted-nearest-neighbour (WNN) clustering.
#
# x          Seurat object that already holds the 'pca', 'apca' and 'lsit'
#            reductions.
# dim_list   list of three dimension ranges, one per reduction, in the order
#            pca, apca, lsit.
# resolution resolution passed to FindClusters() on the 'wsnn' graph.
#
# Returns the object with the WNN graphs, a 'wnn.3.umap' reduction and the
# clustering result added.
wnn_3way_clustering_func <- function(x, dim_list = list(1:30, 1:20, 1:30), resolution = 1) {
  DefaultAssay(x) <- 'SCT'
  x <- FindMultiModalNeighbors(
    x,
    reduction.list = list("pca", "apca", "lsit"),
    dims.list = dim_list,
    k.nn = 20,
    knn.range = 100,
    prune.SNN = 1/20,
    modality.weight.name = c('SCT.weight', 'ADT.weight', 'Tiles.weight')
  )
  x %>%
    RunUMAP(nn.name = "weighted.nn", reduction.name = "wnn.3.umap", reduction.key = "Uw3_") %>%
    FindClusters(
      graph.name = "wsnn", algorithm = 3, resolution = resolution,
      verbose = TRUE, future.seed = TRUE
    )
}

In [None]:
# 3-way WNN clustering of all cells at resolution 0.5
all_so <- wnn_3way_clustering_func(
  all_so,
  dim_list = list(1:30, 1:25, 1:30),
  resolution = 0.5
)

In [None]:
# additionally cluster the same WNN graph at resolution 0.8
all_so <- FindClusters(
  all_so, graph.name = "wsnn", future.seed = TRUE,
  algorithm = 3, resolution = 0.8, verbose = TRUE
)

In [None]:
# 3-way WNN UMAP coloured by cluster, predicted l2 label, cohort and subject
p1 <- DimPlot(
  all_so, label = TRUE, reduction = 'wnn.3.umap',
  raster = TRUE, shuffle = TRUE, group.by = 'wsnn_res.0.5'
) +
  NoLegend()
p2 <- DimPlot(
  all_so, group.by = 'predicted.celltype.l2', raster = TRUE, shuffle = TRUE,
  cols = cluster_colors, reduction = 'wnn.3.umap', label = TRUE
)
p3 <- DimPlot(
  all_so, group.by = 'cohort', raster = TRUE, shuffle = TRUE,
  reduction = 'wnn.3.umap', label = TRUE
) +
  NoLegend()
p4 <- DimPlot(
  all_so, group.by = 'subject_id',
  raster = TRUE, shuffle = TRUE, reduction = 'wnn.3.umap'
)
p1 + p2 + p3 + p4
ggsave(
  file.path(fig_path, paste0(proj_name, '_wsnn_res.0.5_3wnnumap.pdf')),
  width = 12, height = 8
)

In [None]:
# cluster counts and frequencies coloured by cohort
plot_cluster_counts(all_so, 'wsnn_res.0.5', color.by = 'cohort', figname = '')
plot_cluster_freq(all_so, 'wsnn_res.0.5', color.by = 'cohort', figname = '')

In [None]:
# per-subject frequency of each WNN cluster, grouped by cohort
# NOTE(review): p1 is built but never displayed in this cell.
p1 <- dittoSeq::dittoFreqPlot(
  all_so, "wsnn_res.0.5",
  sample.by = "subject_id", group.by = "cohort", color.by = "cohort",
  split.adjust = list(scales = "free")
)

# per-subject frequency of each predicted l2 cell type, grouped by cohort
p2 <- dittoSeq::dittoFreqPlot(
  all_so, "predicted.celltype.l2",
  sample.by = "subject_id", group.by = "cohort", color.by = "cohort",
  split.adjust = list(scales = "free")
)
p2

In [None]:
# save the Seurat object (writes back to the file it was loaded from)
saveRDS(all_so, file.path(data_path, 'PreRA_teaseq_seurat_qc_filtered_cells_lsi.rds'))

In [None]:
# Count cells per (group, cluster, colour) combination and convert the counts
# to within-group frequencies.
#
# seurat_data  Seurat object; only its @meta.data is read.
# cluster.name name of the metadata column holding the cluster / cell-type label.
# group.by     name of the metadata column defining a sample (default 'subject_id').
# color.by     name of an extra metadata column carried along (e.g. cohort).
#
# Returns a tibble grouped by `group.by` with one row per observed
# combination and the columns `counts`, `frequency` (counts divided by the
# group's total) and `total_counts` (the group's total).
CalClusterFreq <- function(seurat_data, cluster.name, group.by='subject_id', 
                              color.by){
    # One row per combination. The grouping columns are kept by summarise()
    # automatically, so they are not re-emitted as per-cell vectors (a
    # multi-row-per-group summarise, deprecated in dplyr 1.1) and no
    # distinct() pass is needed.
    cluster_counts <- seurat_data@meta.data %>% as_tibble() %>% 
        group_by(.data[[group.by]], .data[[cluster.name]], .data[[color.by]]) %>% 
        summarise(counts = n(), .groups = 'drop') %>% 
        group_by(.data[[group.by]]) %>% 
        mutate(frequency=counts/sum(counts), total_counts=sum(counts))
    return(cluster_counts)
}

# PlotClusterBox <- function(freq_table)
# per-subject counts / frequencies of each predicted l2 cell type
ped_sen_gating_celltype_counts <- all_so %>%
    CalClusterFreq(cluster.name='predicted.celltype.l2', group.by = 'subject_id', color.by='cohort')  
# boxplot of raw counts per cohort, one facet per cell type
p1 <- ped_sen_gating_celltype_counts %>%
    ggplot(aes(x=cohort, y=counts, fill=cohort))+ 
    geom_boxplot(outlier.shape = NA) + 
    geom_point(position=position_jitterdodge(), size=0.8, alpha=0.8)+
    facet_wrap(vars(predicted.celltype.l2), scales = 'free_y') +
    theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1),
             text = element_text(size = 12))+
    scale_fill_manual(values =  nejm_color)
p1
ggsave(file.path(fig_path, paste0(proj_name, '_celltype_cohort_count_boxplot.pdf')), 
       width=12, height=8)
ped_sen_l2_celltype_counts <- all_so %>%
    CalClusterFreq(cluster.name='predicted.celltype.l2', group.by = 'subject_id', color.by='cohort')  
# boxplot of frequencies per cohort; plots the table computed just above
# (it was previously computed but left unused while the earlier table was plotted)
p2 <- ped_sen_l2_celltype_counts %>%
    ggplot(aes(x=cohort, y=frequency, fill=cohort))+ 
    geom_boxplot(outlier.shape = NA) + 
    geom_point(position=position_jitterdodge(), size=0.8, alpha=0.8)+
    facet_wrap(vars(predicted.celltype.l2), scales = 'free_y') +
    theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1),
             text = element_text(size = 12)) +
    scale_fill_manual(values =  nejm_color)
p2
ggsave(file.path(fig_path, paste0(proj_name, '_celltype_cohort_frequency_boxplot.pdf')), 
       width=12, height=8)

In [None]:
# per-subject frequency of each predicted l2 cell type, grouped by cohort
dittoSeq::dittoFreqPlot(
  all_so, "predicted.celltype.l2",
  sample.by = "subject_id",
  group.by = "cohort",
  color.by = "cohort",
  split.adjust = list(scales = "free")
)

## subset major cell types by 3wnn

In [None]:
# 3-way WNN UMAP with clusters, predicted l2 labels, cohort and subject
p1 <- DimPlot(
  all_so, label = TRUE, reduction = 'wnn.3.umap',
  raster = TRUE, shuffle = TRUE, group.by = 'wsnn_res.0.5'
) +
  NoLegend()
p2 <- DimPlot(
  all_so, group.by = 'predicted.celltype.l2', cols = cluster_colors,
  raster = TRUE, shuffle = TRUE, reduction = 'wnn.3.umap', label = TRUE
)
p3 <- DimPlot(
  all_so, group.by = 'cohort', cols = nejm_color,
  raster = TRUE, shuffle = TRUE, reduction = 'wnn.3.umap', label = TRUE
) +
  NoLegend()
p4 <- DimPlot(
  all_so, group.by = 'subject_id',
  raster = TRUE, shuffle = TRUE, reduction = 'wnn.3.umap'
)
p1 + p2 + p3 + p4
ggsave(
  file.path(fig_path, paste0(proj_name, '_l2_labels_wsnn_res.0.5_3wnnumap.pdf')),
  width = 12, height = 8
)

In [None]:
# plot the 3wnn clusters on the RNA umap (p1) next to the 3wnn umap panels (p2-p4)
p1 <- DimPlot(all_so, label = T, reduction = 'umap', raster = TRUE, shuffle = TRUE, cols = cluster_colors,
              group.by = 'wsnn_res.0.5') + NoLegend()
p2 <- DimPlot(all_so, group.by = 'predicted.celltype.l2', cols = cluster_colors,raster = TRUE, shuffle = TRUE,
              reduction = 'wnn.3.umap', label = T) 
p3 <- DimPlot(all_so, group.by = 'cohort', cols = nejm_color,
              raster = TRUE, shuffle = TRUE, reduction = 'wnn.3.umap', label = T) + NoLegend()
p4 <- DimPlot(all_so, group.by = 'subject_id',raster = TRUE, shuffle = TRUE, reduction = 'wnn.3.umap')
p1 + p2 + p3 + p4
# Saved under its own file name: this figure previously reused
# '_l2_labels_wsnn_res.0.5_3wnnumap.pdf' and overwrote the all-3wnn figure
# written by the preceding cell.
ggsave(file.path(fig_path, paste0(proj_name, '_l2_labels_wsnn_res.0.5_rnaumap_3wnnumap.pdf')), width=12, height=8)

In [None]:
# composition of each WNN cluster by predicted l1 label
# NOTE(review): the file name says 'l2_labels' although predicted.celltype.l1 is plotted.
dittoBarPlot(
  all_so, "predicted.celltype.l1",
  group.by = "wsnn_res.0.5"
)
ggsave(
  file.path(fig_path, paste0(proj_name, '_l2_labels_wsnn_res.0.5_barplot.pdf')),
  width = 6, height = 6
)

In [None]:
# backbone ADT markers used to define the general cell types
DefaultAssay(all_so) <- 'cleanadt'
adt_markers <- c(
  'CD45', 'CD3', 'CD4', 'CD8', 'CD56',
  'CD19', 'CD14', "HLA.DR", 'CD11b', 'CD16'
)
# check that every marker exists in the cleanadt assay, then list all features
all(adt_markers %in% rownames(all_so))
sort(rownames(all_so))

In [None]:
# cells per subject
all_so@meta.data %>%
  group_by(subject_id) %>%
  tally()

In [None]:
# SCT expression of TNFSF11 on the 3-way WNN UMAP
p1 <- FeaturePlot(
  all_so,
  features = 'sct_TNFSF11',
  raster = FALSE,
  # min.cutoff = "q1", max.cutoff = "q99",
  reduction = 'wnn.3.umap'
)
p1

In [None]:
# gene score of the same gene on the ArchR UMAP embedding (no imputation)
markerGenes <- c('TNFSF11')

p <- plotEmbedding(
  ArchRProj = tea_atac,
  colorBy = "GeneScoreMatrix",
  name = markerGenes,
  embedding = "UMAP",
  quantCut = c(0.01, 0.95),
  imputeWeights = NULL
)
p

In [None]:
# backbone ADT markers on the 3-way WNN UMAP: expression, then density
p1 <- FeaturePlot(
  all_so, features = adt_markers, raster = TRUE,
  min.cutoff = "q1", max.cutoff = "q99",
  reduction = 'wnn.3.umap'
)
p1
ggsave(
  file.path(fig_path, paste0(proj_name, '_l1_adts_3wnnumap_features.pdf')),
  width = 12, height = 8
)

p2 <- plot_density(
  all_so, features = adt_markers,
  reduction = 'wnn.3.umap'
)
p2
ggsave(
  file.path(fig_path, paste0(proj_name, '_l1_adt_3wnnumap_density.pdf')),
  width = 12, height = 8
)

In [None]:
# WNN clusters on the 3-way WNN UMAP (single panel)
# NOTE(review): writes to the same file name as the earlier four-panel
# '_wsnn_res.0.5_3wnnumap.pdf' figure and therefore replaces it - confirm intended.
p1 <- DimPlot(
  all_so, label = TRUE, reduction = 'wnn.3.umap',
  group.by = 'wsnn_res.0.5', cols = cluster_colors, raster = FALSE
)
p1
# p2 <- DimPlot(all_so, group.by = 'predicted.celltype.l2', cols = cluster_colors,
#               raster = FALSE, repel = TRUE,
#               reduction = 'wnn.3.umap', label = T) 
# p1+p2
ggsave(
  file.path(fig_path, paste0(proj_name, '_wsnn_res.0.5_3wnnumap.pdf')),
  width = 12, height = 8
)

In [None]:
# highlight the cells of WNN cluster 16 on the 3-way WNN UMAP
c16_cellids <- rownames(
  filter(all_so@meta.data, wsnn_res.0.5 == 16)
)
p1 <- DimPlot(
  all_so, label = TRUE, reduction = 'wnn.3.umap',
  cells.highlight = c16_cellids,
  group.by = 'wsnn_res.0.5', raster = FALSE
)
p1
# p2 <- DimPlot(all_so, group.by = 'predicted.celxltype.l2', cols = cluster_colors,
#               raster = FALSE, repel = TRUE,
#               reduction = 'wnn.3.umap', label = T) 
# p1+p2
ggsave(
  file.path(fig_path, paste0(proj_name, '_wsnn_res.0.5_3wnnumap_c16.pdf')),
  width = 12, height = 8
)

In [None]:
# check what are the deas and degs for cluster 16 vs all other lymphocytes
# The cluster numbers below are 'wsnn_res.0.5' ids. group.by pins that column
# explicitly: without it FindMarkers() uses the active identities, which the
# most recent FindClusters() call (resolution 0.8) replaced with a different
# clustering.
c16_deas <- FindMarkers(all_so, ident.1 = 16, ident.2 = c(0, 1, 2, 5, 6, 9, 10, 11, 12, 13, 3, 7, 17, 18),
                        group.by = 'wsnn_res.0.5', assay = 'cleanadt')
c16_degs <- FindMarkers(all_so, ident.1 = 16,ident.2 = c(0, 1, 2, 5, 6, 9, 10, 11, 12, 13, 3, 7, 17, 18),
                        group.by = 'wsnn_res.0.5', assay = 'RNA')


In [None]:
# top rows of the differential ADT and gene tables
c16_deas %>% head()
c16_degs %>% head()

In [None]:
# map wsnn_res.0.5 clusters to major (l1) cell types; rules are tried in order
# and cells matching no rule get NA
all_so@meta.data <- mutate(
  all_so@meta.data,
  l1_cell_types = case_when(
    wsnn_res.0.5 %in% c(3, 7, 17, 18) ~ 'B cells',
    wsnn_res.0.5 %in% c(8, 15, 20) ~ 'Myeloid cells',
    wsnn_res.0.5 %in% c(4, 14) | predicted.celltype.l2 == 'NK Proliferating' ~ 'NK cells',
    wsnn_res.0.5 %in% c(19) ~ 'HSPC',
    wsnn_res.0.5 %in% c(0, 1, 2, 5, 6, 9, 10, 11, 12, 13, 16) ~ 'T cells'
  )
)

In [None]:
# check the resulting cluster-to-label mapping
distinct(all_so@meta.data, wsnn_res.0.5, l1_cell_types)

In [None]:
# composition of each assigned l1 cell type by predicted l1 label
dittoBarPlot(all_so, "predicted.celltype.l1", group.by = "l1_cell_types")
ggsave(file.path(fig_path, paste0(proj_name, '_predicted_l1_wsnn_res.0.5_l1_labels_barplot.pdf')), width=6, height=6)

In [None]:
adt_markers <- c('CD45', 'CD3', 'CD4', 'CD8', 'CD56', 'CD19', 'CD14', "HLA.DR", 'CD11b', 'CD16')
# plot classical adts for cell type definition in l1 labels
options(repr.plot.width = 20, repr.plot.height = 15)
VlnPlot(all_so, features =adt_markers, group.by = 'l1_cell_types',
        pt.size = 0, ncol = 4)& 
    stat_summary(fun=median, geom = "point", color="black") 
# '_' added after proj_name so the file name follows the '<proj_name>_<figure>'
# pattern of every other figure (it was 'PreRA_teaseqall_cells_violin_adts.pdf')
ggsave(file.path(fig_path, paste0(proj_name, '_all_cells_violin_adts.pdf')), width=12, height=8)

In [None]:
# save the labelled Seurat object (writes back to the file it was loaded from)
saveRDS(all_so, file.path(data_path, 'PreRA_teaseq_seurat_qc_filtered_cells_lsi.rds'))

In [None]:
# cell numbers per major cell type and subject, written to csv
cell_counts <- all_so@meta.data %>%
  group_by(l1_cell_types, subject_id) %>%
  tally()
write_csv(cell_counts, file.path(output_path, 'PreRA_teaseq_l1_cell_types_counts.csv'))

## subset B cells and do some initial analysis

In [None]:
# project name and figure folder for the B cell analysis
proj_name <- 'PreRA_teaseq_Bcells'
fig_path <- '/home/jupyter/figures/preRA_teaseq/B_cells'
if (!dir.exists(fig_path)) {
  (dir.create(fig_path))
}

In [None]:
# keep only the cells labelled 'B cells'
ra_tea_b <- subset(all_so, l1_cell_types == 'B cells')
ra_tea_b

In [None]:
colnames(ra_tea_b@meta.data)
# derive ids from the sample ids: 'PB' replaced by 'KT', then split on '-'
kit_id <- ra_tea_b@meta.data %>%
  distinct(pbmc_sample_id) %>%
  pull(pbmc_sample_id) %>%
  str_replace('PB', 'KT') %>%
  str_split("-", simplify = TRUE)
kit_id[,1]

In [None]:
# sample-to-subject mapping, ordered by subject
ra_tea_b@meta.data %>%
  distinct(pbmc_sample_id, subject_id) %>%
  arrange(subject_id)

### reclustering on RNA

In [None]:
# re-run SCTransform on the B cells regressing out percent.mt, then PCA
DefaultAssay(ra_tea_b) <- "RNA"
ra_tea_b <- suppressWarnings(
  SCTransform(ra_tea_b, vars.to.regress = 'percent.mt')
) %>%
  RunPCA()

In [None]:
# elbow plot of the first 50 PCs
ElbowPlot(ra_tea_b, ndims = 50, reduction = 'pca')

In [None]:
# UMAP, neighbour graph and clustering on the first 30 PCs
ra_tea_b <- RunUMAP(ra_tea_b, dims = 1:30, verbose = TRUE) %>%
  FindNeighbors(dims = 1:30, verbose = TRUE) %>%
  FindClusters(resolution = 0.5, verbose = TRUE, future.seed = TRUE)

In [None]:
# B cell RNA UMAP coloured by cluster, predicted l2 label, subject and cohort
options(repr.plot.width = 20, repr.plot.height = 10)
p1 <- DimPlot(
  ra_tea_b, label = TRUE, reduction = 'umap',
  group.by = 'SCT_snn_res.0.5'
)
p2 <- DimPlot(
  ra_tea_b, group.by = 'predicted.celltype.l2', cols = cluster_colors,
  reduction = 'umap', label = TRUE, repel = TRUE
) +
  NoLegend()
p3 <- DimPlot(ra_tea_b, group.by = 'subject_id', reduction = 'umap')
p4 <- DimPlot(
  ra_tea_b, group.by = 'cohort',
  cols = nejm_color, reduction = 'umap'
)
p1 + p2 + p3 + p4
ggsave(
  file.path(fig_path, paste0(proj_name, '_rna_umap.pdf')),
  width = 12, height = 8
)

### reclustering on cleanadt

In [None]:
# ADT-only analysis of the B cells: CLR-normalise, scale, PCA
DefaultAssay(ra_tea_b) <- 'cleanadt'
ra_tea_b <- NormalizeData(ra_tea_b, normalization.method = 'CLR', margin = 2) %>%
  ScaleData() %>%
  RunPCA(reduction.name = 'apca')
ElbowPlot(ra_tea_b, ndims = 50, reduction = 'apca')

# all_so <- NormalizeData(all_so, normalization.method = "CLR", margin = 2)
# all_so <- adt_clustering_func(all_so, assay='cleanadt', resolution = 0.8)

In [None]:
# UMAP and clustering on the first 20 ADT PCs
ra_tea_b <- RunUMAP(
  ra_tea_b, dims = 1:20, reduction = 'apca',
  reduction.name = 'adt_umap', reduction.key = 'adtumap_'
) %>%
  FindNeighbors(dims = 1:20, reduction = 'apca') %>%
  FindClusters(resolution = 0.5, future.seed = TRUE)

In [None]:
# B cell ADT UMAP coloured by cluster, predicted l2 label, subject and cohort
options(repr.plot.width = 20, repr.plot.height = 15)
p1 <- DimPlot(
  ra_tea_b, label = TRUE, reduction = 'adt_umap',
  group.by = 'cleanadt_snn_res.0.5'
) +
  NoLegend()
p2 <- DimPlot(
  ra_tea_b, group.by = 'predicted.celltype.l2', cols = cluster_colors,
  reduction = 'adt_umap', label = TRUE, repel = TRUE
)
p3 <- DimPlot(ra_tea_b, group.by = 'subject_id', reduction = 'adt_umap')
p4 <- DimPlot(
  ra_tea_b, group.by = 'cohort',
  cols = nejm_color, reduction = 'adt_umap'
)
p1 + p2 + p3 + p4
ggsave(
  file.path(fig_path, paste0(proj_name, '_cleanadt_umap.pdf')),
  width = 12, height = 8
)


### clustering on ATAC

In [None]:
# load the ArchR project holding the ATAC arrow files
tea_atac <- loadArchRProject(
  path = "/home/jupyter/data/preRA_teaseq/EXP-00243/atac_arrows"
)
# re-run LSI on the B cell subset and load it into the Seurat object
ra_tea_b <- ExtractLSI(
  ra_tea_b, tea_atac,
  save.archR = TRUE,
  dropCells = TRUE,
  archR.dir = '/home/jupyter/data/preRA_teaseq/EXP-00243/B_cells'
)

In [None]:
# UMAP and clustering on the 'lsit' reduction (dims 1:30)
DefaultAssay(ra_tea_b) <- 'Tiles'
ra_tea_b <- RunUMAP(
  ra_tea_b, dims = 1:30, reduction = 'lsit',
  reduction.name = 'atac_umap', reduction.key = 'atacumap_'
) %>%
  FindNeighbors(dims = 1:30, reduction = 'lsit') %>%
  FindClusters(resolution = 0.8, future.seed = TRUE)

In [None]:
colnames(ra_tea_b@meta.data)
ra_tea_b

In [None]:
# B cell ATAC UMAP coloured by cluster, predicted l2 label, subject and cohort
options(repr.plot.width = 20, repr.plot.height = 15)
p1 <- DimPlot(
  ra_tea_b, label = TRUE, reduction = 'atac_umap',
  group.by = 'Tiles_snn_res.0.8'
) +
  NoLegend()
p2 <- DimPlot(
  ra_tea_b, group.by = 'predicted.celltype.l2', cols = cluster_colors,
  reduction = 'atac_umap', label = TRUE, repel = TRUE
)
p3 <- DimPlot(ra_tea_b, group.by = 'subject_id', reduction = 'atac_umap')
p4 <- DimPlot(
  ra_tea_b, group.by = 'cohort',
  cols = nejm_color, reduction = 'atac_umap'
)
p1 + p2 + p3 + p4
ggsave(
  file.path(fig_path, paste0(proj_name, '_all_cells_atac_umap.pdf')),
  width = 12, height = 8
)

In [None]:
# plot_cluster_freq(ra_tea_b, cluster.name = 'Tiles_snn_res.0.8', color.by = 'cohort')
# plot_cluster_freq(all_so, cluster.name = 'Tiles_snn_res.0.8', color.by = 'cohort', figname='subtypeA')
# plot_cluster_counts(all_so, cluster.name = 'predicted.MonocyteSubsets', color.by = 'cohort', figname='subtypeA')

In [None]:
sort(colnames(ra_tea_b@meta.data))

### 3way wnn clustering

In [None]:
# # do 3way wnn clustering
# wnn_3way_clustering_func <- function(x, dim_list= list(1:30, 1:20, 1:30), resolution = 1){
#     DefaultAssay(x) <- 'SCT'
#   x <- FindMultiModalNeighbors(
#     x, reduction.list = list("pca", "apca", "lsit"),
#       k.nn = 20, knn.range = 100, prune.SNN = 1/20,
#     dims.list = dim_list,
#       modality.weight.name = c('SCT.weight', 'ADT.weight', 'Tiles.weight')
#   )
#   x <- RunUMAP(x, nn.name = "weighted.nn", reduction.name = "wnn.3.umap", reduction.key = "Uw3_")
#   x <- FindClusters(x, graph.name = "wsnn", algorithm = 3, resolution = resolution, verbose = TRUE, 
#                     future.seed=TRUE)
#   x
# }

In [None]:
# 3-way WNN clustering of the B cells at resolution 0.5
ra_tea_b <- wnn_3way_clustering_func(
  ra_tea_b,
  dim_list = list(1:30, 1:20, 1:30),
  resolution = 0.5
)

In [None]:
# cluster the WNN graph at resolutions 0.1, 0.2, ..., 1
for (res in seq(0.1, 1, 0.1)) {
  ra_tea_b <- FindClusters(
    ra_tea_b, graph.name = "wsnn", future.seed = TRUE,
    algorithm = 3, resolution = res, verbose = TRUE
  )
}


In [None]:
seq(0.2, 1, 0.1)

In [None]:
# cluster tree across the wsnn resolutions, to judge which one is appropriate
library(clustree)
# ra_tea_b@meta.data %>% colnames() %>% str_starts('wsnn_')
clustree(ra_tea_b, prefix = "wsnn_res.")
ggsave(
  file.path(fig_path, paste0(proj_name, '_3wnn_cluster_tree.pdf')),
  width = 12, height = 8
)

In [None]:
# B cell 3-way WNN UMAP coloured by cluster, predicted l2 label, cohort and subject
p1 <- DimPlot(
  ra_tea_b, label = TRUE, reduction = 'wnn.3.umap',
  group.by = 'wsnn_res.0.5'
) +
  NoLegend()
p2 <- DimPlot(
  ra_tea_b, group.by = 'predicted.celltype.l2', cols = cluster_colors,
  reduction = 'wnn.3.umap', label = TRUE
)
p3 <- DimPlot(
  ra_tea_b, group.by = 'cohort', cols = nejm_color,
  reduction = 'wnn.3.umap', label = TRUE
)
p4 <- DimPlot(ra_tea_b, group.by = 'subject_id', reduction = 'wnn.3.umap')
p1 + p2 + p3 + p4
ggsave(
  file.path(fig_path, paste0(proj_name, '_wsnn_res.0.5_3wnnumap.pdf')),
  width = 12, height = 8
)

In [None]:
# plot_cluster_counts(all_so, 'wsnn_res.0.5', color.by='cohort', figname = '')
# cluster frequencies and counts coloured by cohort
plot_cluster_freq(ra_tea_b, 'wsnn_res.0.5', color.by = 'cohort', figname = '')
plot_cluster_counts(ra_tea_b, 'wsnn_res.0.5', color.by = 'cohort', figname = '')

In [None]:
# save the B cell Seurat object
saveRDS(ra_tea_b, file.path(data_path, 'PreRA_teaseq_seurat_b_cells.rds'))

In [None]:
# SCT features whose names contain AIM2, IGHG or IGHA
DefaultAssay(ra_tea_b) <- 'SCT'
str_subset(rownames(ra_tea_b), 'AIM2|IGHG|IGHA')

### labeling B cell clusters

In [None]:
# reload the saved B cell Seurat object
ra_tea_b <- readRDS(
  file.path(data_path, 'PreRA_teaseq_seurat_b_cells.rds')
)

In [None]:
# canonical B cell protein (ADT) and gene markers, each sorted alphabetically
DefaultAssay(ra_tea_b) <- 'cleanadt'
sort(rownames(ra_tea_b))
b_adts <- sort(c(
  'CD19', 'CD20', 'CD21', 'CD23', 'CD25', 'CD95', 'CD40', 'CD38', 'CD24',
  'CD9', 'CD69', 'TIGIT',
  'CD80', 'CD22', 'CD73', 'CD279',
  'CD27', 'CD10', 'CD319', 'HLA.DR-DP-DQ', 'HLA.DR',
  'CD11c', 'IgD', 'IgE', 'IgM', 'Ig-light-chain-k', 'Ig-light-chain-l'
))
b_genes <- sort(c(
  'sct_TBX21', 'sct_ITGAX', 'sct_PDCD1', 'sct_PRDM1', 'sct_TNFRSF17',
  'sct_ERO1B',
  'sct_STAT3', 'sct_IKZF3', 'sct_AIM2', 'sct_IGHA2', 'sct_IGHG4', 'sct_IGHG2',
  'sct_IGHGP', 'sct_IGHA1', 'sct_IGHG1', 'sct_IGHG3'
))

In [None]:
# B cell ADT markers: feature plots, density plots and per-cluster violins
DefaultAssay(ra_tea_b) <- 'cleanadt'
p1 <- FeaturePlot(
  ra_tea_b, features = b_adts, raster = FALSE,
  min.cutoff = "q1", max.cutoff = "q99",
  reduction = 'wnn.3.umap'
)
p1
ggsave(
  file.path(fig_path, paste0(proj_name, '_adts_3wnnumap.pdf')),
  width = 24, height = 18
)
p2 <- plot_density(ra_tea_b, b_adts, reduction = 'wnn.3.umap')
p2
ggsave(
  file.path(fig_path, paste0(proj_name, '_adts_density_3wnnumap.pdf')),
  width = 24, height = 18
)
p3 <- VlnPlot(
  ra_tea_b, features = b_adts, group.by = 'wsnn_res.0.5',
  pt.size = 0, ncol = 4
) &
  stat_summary(fun = median, geom = "point", color = "black")
p3
ggsave(
  file.path(fig_path, paste0(proj_name, '_adts_violin_wsnn_res.0.5.pdf')),
  width = 24, height = 18
)

In [None]:
# B cell gene markers: feature plots, density plots and per-cluster violins
DefaultAssay(ra_tea_b) <- 'SCT'
p1 <- FeaturePlot(
  ra_tea_b, features = b_genes, raster = FALSE,
  min.cutoff = "q1", max.cutoff = "q99",
  reduction = 'wnn.3.umap'
)
p1
ggsave(
  file.path(fig_path, paste0(proj_name, '_genes_3wnnumap.pdf')),
  width = 24, height = 18
)
p2 <- plot_density(ra_tea_b, b_genes, reduction = 'wnn.3.umap')
p2
ggsave(
  file.path(fig_path, paste0(proj_name, '_genes_density_3wnnumap.pdf')),
  width = 24, height = 18
)
p3 <- VlnPlot(
  ra_tea_b, features = b_genes, group.by = 'wsnn_res.0.5',
  pt.size = 0, ncol = 4
) &
  stat_summary(fun = median, geom = "point", color = "black")
p3
ggsave(
  file.path(fig_path, paste0(proj_name, '_genes_wsnn_res.0.5_violin.pdf')),
  width = 24, height = 18
)

In [None]:
# dot plots of the ADT and gene markers per wsnn_res.0.5 cluster
DotPlot(
  ra_tea_b, assay = 'cleanadt',
  features = b_adts, group.by = 'wsnn_res.0.5'
) +
  RotatedAxis()
ggsave(
  file.path(fig_path, paste0(proj_name, '_adts_dotplot_wsnn_res.0.5.pdf')),
  width = 12, height = 8
)
DotPlot(
  ra_tea_b, assay = 'SCT',
  features = b_genes, group.by = 'wsnn_res.0.5'
) +
  RotatedAxis()
ggsave(
  file.path(fig_path, paste0(proj_name, '_genes_dotplot_wsnn_res.0.5.pdf')),
  width = 12, height = 8
)

In [None]:
# per-subject cluster frequencies, grouped by cohort
# NOTE(review): this uses wsnn_res.0.4 while the other plots use wsnn_res.0.5 - confirm.
dittoSeq::dittoFreqPlot(
  ra_tea_b, "wsnn_res.0.4",
  sample.by = "subject_id", group.by = "cohort", color.by = "cohort",
  split.adjust = list(scales = "free")
)

## subset T cells and do some initial analysis

In [None]:
# project name and figure folder for the T cell analysis
proj_name <- 'PreRA_teaseq_Tcells'
fig_path <- '/home/jupyter/figures/preRA_teaseq/t_cells'
if (!dir.exists(fig_path)) {
  (dir.create(fig_path))
}

In [None]:
# keep only the cells labelled 'T cells'
ra_tea_t <- subset(all_so, l1_cell_types == 'T cells')
ra_tea_t

### reclustering on RNA

In [None]:
# re-run SCTransform on the T cells regressing out percent.mt, then PCA
DefaultAssay(ra_tea_t) <- "RNA"
ra_tea_t <- suppressWarnings(
  SCTransform(ra_tea_t, vars.to.regress = 'percent.mt')
) %>%
  RunPCA()

In [None]:
# elbow plot of the first 50 PCs
ElbowPlot(ra_tea_t, ndims = 50, reduction = 'pca')

In [None]:
# UMAP, neighbour graph and clustering on the first 30 PCs
ra_tea_t <- RunUMAP(ra_tea_t, dims = 1:30, verbose = TRUE) %>%
  FindNeighbors(dims = 1:30, verbose = TRUE) %>%
  FindClusters(resolution = 0.5, verbose = TRUE, future.seed = TRUE)

In [None]:
# T cell RNA UMAP coloured by cluster, predicted l2 label, subject and cohort
options(repr.plot.width = 20, repr.plot.height = 10)
p1 <- DimPlot(
  ra_tea_t, label = TRUE, reduction = 'umap',
  group.by = 'SCT_snn_res.0.5'
)
p2 <- DimPlot(
  ra_tea_t, group.by = 'predicted.celltype.l2', cols = cluster_colors,
  reduction = 'umap', label = TRUE, repel = TRUE
) +
  NoLegend()
p3 <- DimPlot(ra_tea_t, group.by = 'subject_id', reduction = 'umap')
p4 <- DimPlot(
  ra_tea_t, group.by = 'cohort',
  cols = nejm_color, reduction = 'umap'
)
p1 + p2 + p3 + p4
ggsave(
  file.path(fig_path, paste0(proj_name, '_rna_umap.pdf')),
  width = 12, height = 8
)

### reclustering on cleanadt

In [None]:
# ADT-only analysis: CLR-normalise (margin = 2), scale and run PCA on the cleanadt assay
DefaultAssay(ra_tea_t) <- "cleanadt"
ra_tea_t <- NormalizeData(ra_tea_t, normalization.method = "CLR", margin = 2)
ra_tea_t <- ScaleData(ra_tea_t)
ra_tea_t <- RunPCA(ra_tea_t, reduction.name = "apca")
ElbowPlot(ra_tea_t, reduction = "apca", ndims = 50)

# all_so <- NormalizeData(all_so, normalization.method = "CLR", margin = 2)
# all_so <- adt_clustering_func(all_so, assay='cleanadt', resolution = 0.8)

In [None]:
# UMAP, neighbour graph and clustering (resolution 0.5) on the first 20 ADT PCs
ra_tea_t <- RunUMAP(
    ra_tea_t,
    dims = 1:20, reduction = "apca",
    reduction.name = "adt_umap", reduction.key = "adtumap_"
)
ra_tea_t <- FindNeighbors(ra_tea_t, dims = 1:20, reduction = "apca")
ra_tea_t <- FindClusters(ra_tea_t, resolution = 0.5, future.seed = TRUE)

In [None]:
# ADT UMAP coloured by ADT cluster, predicted l2 cell type, subject and cohort
options(repr.plot.width = 20, repr.plot.height = 15)
p1 <- DimPlot(
    ra_tea_t,
    reduction = "adt_umap", group.by = "cleanadt_snn_res.0.5", label = TRUE
) + NoLegend()
p2 <- DimPlot(
    ra_tea_t,
    reduction = "adt_umap", group.by = "predicted.celltype.l2",
    cols = cluster_colors, label = TRUE, repel = TRUE
)
p3 <- DimPlot(ra_tea_t, reduction = "adt_umap", group.by = "subject_id")
p4 <- DimPlot(ra_tea_t, reduction = "adt_umap", group.by = "cohort", cols = nejm_color)
p1 + p2 + p3 + p4
ggsave(
    file.path(fig_path, paste0(proj_name, "_cleanadt_umap.pdf")),
    width = 12, height = 8
)


### clustering on ATAC

In [None]:
# load the ArchR project; paths are built from data_path (set in the setup cell)
# so the experiment folder is defined in a single place
tea_atac <- loadArchRProject(path = file.path(data_path, 'atac_arrows'))
# rerun lsi in the subset cells and load lsi to seurat
ra_tea_t <- ExtractLSI(ra_tea_t, tea_atac, save.archR = TRUE,
                       dropCells = TRUE, archR.dir = file.path(data_path, 'T_cells'))

In [None]:
# UMAP, neighbour graph and clustering (resolution 0.8) on the first 30 'lsit' dimensions
DefaultAssay(ra_tea_t) <- "Tiles"
ra_tea_t <- RunUMAP(
    ra_tea_t,
    dims = 1:30, reduction = "lsit",
    reduction.name = "atac_umap", reduction.key = "atacumap_"
)
ra_tea_t <- FindNeighbors(ra_tea_t, dims = 1:30, reduction = "lsit")
ra_tea_t <- FindClusters(ra_tea_t, resolution = 0.8, future.seed = TRUE)

In [None]:
# list the metadata columns, then show the object summary
colnames(ra_tea_t@meta.data)
ra_tea_t

In [None]:
# ATAC UMAP coloured by Tiles cluster, predicted l2 cell type, subject and cohort
options(repr.plot.width = 20, repr.plot.height = 15)
p1 <- DimPlot(
    ra_tea_t,
    reduction = "atac_umap", group.by = "Tiles_snn_res.0.8", label = TRUE
) + NoLegend()
p2 <- DimPlot(
    ra_tea_t,
    reduction = "atac_umap", group.by = "predicted.celltype.l2",
    cols = cluster_colors, label = TRUE, repel = TRUE
)
p3 <- DimPlot(ra_tea_t, reduction = "atac_umap", group.by = "subject_id")
p4 <- DimPlot(ra_tea_t, reduction = "atac_umap", group.by = "cohort", cols = nejm_color)
p1 + p2 + p3 + p4
ggsave(
    file.path(fig_path, paste0(proj_name, "_all_cells_atac_umap.pdf")),
    width = 12, height = 8
)

In [None]:
# plot_cluster_freq(ra_tea_t, cluster.name = 'Tiles_snn_res.0.8', color.by = 'cohort')
# plot_cluster_freq(all_so, cluster.name = 'Tiles_snn_res.0.8', color.by = 'cohort', figname='subtypeA')
# plot_cluster_counts(all_so, cluster.name = 'predicted.MonocyteSubsets', color.by = 'cohort', figname='subtypeA')

In [None]:
# metadata column names in alphabetical order
sort(colnames(ra_tea_t@meta.data))

### 3way wnn clustering

In [None]:
# # do 3way wnn clustering
# wnn_3way_clustering_func <- function(x, dim_list= list(1:30, 1:20, 1:30), resolution = 1){
#     DefaultAssay(x) <- 'SCT'
#   x <- FindMultiModalNeighbors(
#     x, reduction.list = list("pca", "apca", "lsit"),
#       k.nn = 20, knn.range = 100, prune.SNN = 1/20,
#     dims.list = dim_list,
#       modality.weight.name = c('SCT.weight', 'ADT.weight', 'Tiles.weight')
#   )
#   x <- RunUMAP(x, nn.name = "weighted.nn", reduction.name = "wnn.3.umap", reduction.key = "Uw3_")
#   x <- FindClusters(x, graph.name = "wsnn", algorithm = 3, resolution = resolution, verbose = TRUE, 
#                     future.seed=TRUE)
#   x
# }

In [None]:
# 3-way WNN clustering; dim_list supplies 1:30, 1:20 and 1:30 dimensions, resolution 0.5
ra_tea_t <- wnn_3way_clustering_func(
    ra_tea_t,
    dim_list = list(1:30, 1:20, 1:30),
    resolution = 0.5
)

In [None]:
# cluster the wsnn graph at resolutions 0.1, 0.2, ..., 1
for (res in seq(from = 0.1, to = 1, by = 0.1)) {
    ra_tea_t <- FindClusters(
        ra_tea_t,
        graph.name = "wsnn", algorithm = 3,
        resolution = res, verbose = TRUE, future.seed = TRUE
    )
}


In [None]:
# show the sequence 0.2, 0.3, ..., 1
seq(from = 0.2, to = 1, by = 0.1)

In [None]:
# cluster tree across the wsnn resolutions, used to judge which resolution is appropriate
library(clustree)
# ra_tea_t@meta.data %>% colnames() %>% str_starts('wsnn_')
clustree(ra_tea_t, prefix = "wsnn_res.")
ggsave(
    file.path(fig_path, paste0(proj_name, "_3wnn_cluster_tree.pdf")),
    width = 12, height = 8
)

In [None]:
# 3-way WNN UMAP coloured by wsnn cluster (res 0.3), predicted l2 cell type, cohort and subject
p1 <- DimPlot(
    ra_tea_t,
    reduction = "wnn.3.umap", group.by = "wsnn_res.0.3",
    label = TRUE, raster = TRUE, shuffle = TRUE
) + NoLegend()
p2 <- DimPlot(
    ra_tea_t,
    reduction = "wnn.3.umap", group.by = "predicted.celltype.l2",
    cols = cluster_colors, label = TRUE, raster = TRUE, shuffle = TRUE
)
p3 <- DimPlot(
    ra_tea_t,
    reduction = "wnn.3.umap", group.by = "cohort",
    cols = nejm_color, label = TRUE, raster = TRUE, shuffle = TRUE
)
p4 <- DimPlot(
    ra_tea_t,
    reduction = "wnn.3.umap", group.by = "subject_id",
    raster = TRUE, shuffle = TRUE
)
p1 + p2 + p3 + p4
ggsave(
    file.path(fig_path, paste0(proj_name, "_wsnn_res.0.3_3wnnumap.pdf")),
    width = 12, height = 8
)

In [None]:
# cluster frequency and count plots at wsnn resolution 0.4, coloured by cohort
# plot_cluster_counts(all_so, 'wsnn_res.0.5', color.by='cohort', figname = '')
plot_cluster_freq(ra_tea_t, "wsnn_res.0.4", color.by = "cohort", figname = "")
plot_cluster_counts(ra_tea_t, "wsnn_res.0.4", color.by = "cohort", figname = "")

In [None]:
# write the T cell Seurat object into data_path
saveRDS(ra_tea_t, file.path(data_path, "PreRA_teaseq_seurat_t_cells.rds"))

### labeling T cell clusters

In [None]:
# T cell protein markers to plot; the assay's feature names are listed first for reference
DefaultAssay(ra_tea_t) <- "cleanadt"
sort(rownames(ra_tea_t))
t_adts <- sort(c(
    "CD4", "CD8", "CD3", "CD45RA", "CD45RO",
    "CD197", "CD95", "CD25", "CD57", "KLRG1"
))
# t_genes <- c('rna_TBX21', 'rna_ITGAX', 'rna_PDCD1', 'rna_PRDM1', 'rna_ERO1B', 'rna_STAT3', 'rna_IKZF3')

In [None]:
# feature plots and density plots of the T cell ADT markers on the 3-way WNN UMAP
p1 <- FeaturePlot(
    ra_tea_t,
    features = t_adts, reduction = "wnn.3.umap",
    min.cutoff = "q1", max.cutoff = "q99", raster = TRUE
)
p1
ggsave(
    file.path(fig_path, paste0(proj_name, "_adt_genes_3wnnumap.pdf")),
    width = 24, height = 18
)
p2 <- plot_density(ra_tea_t, t_adts, reduction = "wnn.3.umap")
p2
ggsave(
    file.path(fig_path, paste0(proj_name, "_adts_genes_density_3wnnumap.pdf")),
    width = 24, height = 18
)

#### label transfer from the ped vs senior object

In [None]:
# plot the boxplot of the T cell clusters (wsnn resolution 0.4) per subject, grouped by cohort
# uses ra_tea_t: this cell belongs to the T cell section, the B cell object was a copy-paste leftover
dittoSeq::dittoFreqPlot(ra_tea_t, "wsnn_res.0.4",
     sample.by = "subject_id", group.by = "cohort", color.by = "cohort",
    split.adjust = list(scales = "free"))

## subset NK cells and do some initial analysis

In [None]:
# project name and figure directory used by the NK cell section
proj_name <- 'PreRA_teaseq_NK'
fig_path <- '/home/jupyter/figures/preRA_teaseq/NK'
# recursive = TRUE (as in the setup cell) so this also works when the parent folder is missing
if (!dir.exists(fig_path)) (dir.create(fig_path, recursive = TRUE))

In [None]:
# keep only the cells whose level-1 label is 'NK cells' from the PBMC object
ra_tea_nk <- subset(x = all_so, subset = l1_cell_types == "NK cells")
ra_tea_nk

### reclustering on RNA

In [None]:
# rerun SCTransform on the RNA assay, regressing out percent.mt, then compute the PCA
DefaultAssay(ra_tea_nk) <- "RNA"
ra_tea_nk <- suppressWarnings(
    SCTransform(ra_tea_nk, vars.to.regress = "percent.mt")
)
ra_tea_nk <- RunPCA(ra_tea_nk)

In [None]:
# elbow plot of the first 50 components of the RNA PCA
ElbowPlot(ra_tea_nk, reduction = "pca", ndims = 50)

In [None]:
# UMAP, neighbour graph and clustering (resolution 0.5) on the first 10 PCs
ra_tea_nk <- RunUMAP(ra_tea_nk, dims = 1:10, verbose = TRUE)
ra_tea_nk <- FindNeighbors(ra_tea_nk, dims = 1:10, verbose = TRUE)
ra_tea_nk <- FindClusters(ra_tea_nk, resolution = 0.5, verbose = TRUE, future.seed = TRUE)

In [None]:
# RNA UMAP coloured by SCT cluster, predicted l2 cell type, subject and cohort
options(repr.plot.width = 20, repr.plot.height = 10)
p1 <- DimPlot(ra_tea_nk, reduction = "umap", group.by = "SCT_snn_res.0.5", label = TRUE)
p2 <- DimPlot(
    ra_tea_nk,
    reduction = "umap", group.by = "predicted.celltype.l2",
    cols = cluster_colors, label = TRUE, repel = TRUE
)
p3 <- DimPlot(ra_tea_nk, reduction = "umap", group.by = "subject_id")
p4 <- DimPlot(ra_tea_nk, reduction = "umap", group.by = "cohort", cols = nejm_color)
p1 + p2 + p3 + p4
ggsave(
    file.path(fig_path, paste0(proj_name, "_rna_umap.pdf")),
    width = 12, height = 8
)

### reclustering on cleanadt

In [None]:
# ADT-only analysis: CLR-normalise (margin = 2), scale and run PCA on the cleanadt assay
DefaultAssay(ra_tea_nk) <- "cleanadt"
ra_tea_nk <- NormalizeData(ra_tea_nk, normalization.method = "CLR", margin = 2)
ra_tea_nk <- ScaleData(ra_tea_nk)
ra_tea_nk <- RunPCA(ra_tea_nk, reduction.name = "apca")
ElbowPlot(ra_tea_nk, reduction = "apca", ndims = 50)

# all_so <- NormalizeData(all_so, normalization.method = "CLR", margin = 2)
# all_so <- adt_clustering_func(all_so, assay='cleanadt', resolution = 0.8)

In [None]:
# UMAP, neighbour graph and clustering (resolution 0.5) on the first 15 ADT PCs
ra_tea_nk <- RunUMAP(
    ra_tea_nk,
    dims = 1:15, reduction = "apca",
    reduction.name = "adt_umap", reduction.key = "adtumap_"
)
ra_tea_nk <- FindNeighbors(ra_tea_nk, dims = 1:15, reduction = "apca")
ra_tea_nk <- FindClusters(ra_tea_nk, resolution = 0.5, future.seed = TRUE)

In [None]:
# ADT UMAP coloured by ADT cluster, predicted l2 cell type, subject and cohort
options(repr.plot.width = 20, repr.plot.height = 15)
p1 <- DimPlot(
    ra_tea_nk,
    reduction = "adt_umap", group.by = "cleanadt_snn_res.0.5", label = TRUE
) + NoLegend()
p2 <- DimPlot(
    ra_tea_nk,
    reduction = "adt_umap", group.by = "predicted.celltype.l2",
    cols = cluster_colors, label = TRUE, repel = TRUE
)
p3 <- DimPlot(ra_tea_nk, reduction = "adt_umap", group.by = "subject_id")
p4 <- DimPlot(ra_tea_nk, reduction = "adt_umap", group.by = "cohort", cols = nejm_color)
p1 + p2 + p3 + p4
ggsave(
    file.path(fig_path, paste0(proj_name, "_cleanadt_umap.pdf")),
    width = 12, height = 8
)


### clustering on ATAC

In [None]:
# load the ArchR project; paths are built from data_path (set in the setup cell)
# so the experiment folder is defined in a single place
tea_atac <- loadArchRProject(path = file.path(data_path, 'atac_arrows'))
# rerun lsi in the subset cells and load lsi to seurat
ra_tea_nk <- ExtractLSI(ra_tea_nk, tea_atac, save.archR = TRUE,
                       dropCells = TRUE, archR.dir = file.path(data_path, 'NK_cells'))

In [None]:
# UMAP, neighbour graph and clustering (resolution 0.8) on the first 30 'lsit' dimensions
DefaultAssay(ra_tea_nk) <- "Tiles"
ra_tea_nk <- RunUMAP(
    ra_tea_nk,
    dims = 1:30, reduction = "lsit",
    reduction.name = "atac_umap", reduction.key = "atacumap_"
)
ra_tea_nk <- FindNeighbors(ra_tea_nk, dims = 1:30, reduction = "lsit")
ra_tea_nk <- FindClusters(ra_tea_nk, resolution = 0.8, future.seed = TRUE)

In [None]:
# list the metadata columns, then show the object summary
colnames(ra_tea_nk@meta.data)
ra_tea_nk

In [None]:
# ATAC UMAP coloured by Tiles cluster, predicted l2 cell type, subject and cohort
options(repr.plot.width = 20, repr.plot.height = 15)
p1 <- DimPlot(
    ra_tea_nk,
    reduction = "atac_umap", group.by = "Tiles_snn_res.0.8", label = TRUE
) + NoLegend()
p2 <- DimPlot(
    ra_tea_nk,
    reduction = "atac_umap", group.by = "predicted.celltype.l2",
    cols = cluster_colors, label = TRUE, repel = TRUE
)
p3 <- DimPlot(ra_tea_nk, reduction = "atac_umap", group.by = "subject_id")
p4 <- DimPlot(ra_tea_nk, reduction = "atac_umap", group.by = "cohort", cols = nejm_color)
p1 + p2 + p3 + p4
ggsave(
    file.path(fig_path, paste0(proj_name, "_all_cells_atac_umap.pdf")),
    width = 12, height = 8
)

In [None]:
# plot_cluster_freq(ra_tea_nk, cluster.name = 'Tiles_snn_res.0.8', color.by = 'cohort')
# plot_cluster_freq(all_so, cluster.name = 'Tiles_snn_res.0.8', color.by = 'cohort', figname='subtypeA')
# plot_cluster_counts(all_so, cluster.name = 'predicted.MonocyteSubsets', color.by = 'cohort', figname='subtypeA')

In [None]:
# metadata column names in alphabetical order
sort(colnames(ra_tea_nk@meta.data))

### 3way wnn clustering

In [None]:
# # do 3way wnn clustering
# wnn_3way_clustering_func <- function(x, dim_list= list(1:30, 1:20, 1:30), resolution = 1){
#     DefaultAssay(x) <- 'SCT'
#   x <- FindMultiModalNeighbors(
#     x, reduction.list = list("pca", "apca", "lsit"),
#       k.nn = 20, knn.range = 100, prune.SNN = 1/20,
#     dims.list = dim_list,
#       modality.weight.name = c('SCT.weight', 'ADT.weight', 'Tiles.weight')
#   )
#   x <- RunUMAP(x, nn.name = "weighted.nn", reduction.name = "wnn.3.umap", reduction.key = "Uw3_")
#   x <- FindClusters(x, graph.name = "wsnn", algorithm = 3, resolution = resolution, verbose = TRUE, 
#                     future.seed=TRUE)
#   x
# }

In [None]:
# 3-way WNN clustering; dim_list supplies 1:10, 1:15 and 1:30 dimensions, resolution 0.5
ra_tea_nk <- wnn_3way_clustering_func(
    ra_tea_nk,
    dim_list = list(1:10, 1:15, 1:30),
    resolution = 0.5
)

In [None]:
# cluster the wsnn graph at resolutions 0.1, 0.2, ..., 1
for (res in seq(from = 0.1, to = 1, by = 0.1)) {
    ra_tea_nk <- FindClusters(
        ra_tea_nk,
        graph.name = "wsnn", algorithm = 3,
        resolution = res, verbose = TRUE, future.seed = TRUE
    )
}


In [None]:
# show the sequence 0.2, 0.3, ..., 1
seq(from = 0.2, to = 1, by = 0.1)

In [None]:
# cluster tree across the wsnn resolutions, used to judge which resolution is appropriate
library(clustree)
# ra_tea_nk@meta.data %>% colnames() %>% str_starts('wsnn_')
clustree(ra_tea_nk, prefix = "wsnn_res.")
ggsave(
    file.path(fig_path, paste0(proj_name, "_3wnn_cluster_tree.pdf")),
    width = 12, height = 8
)

In [None]:
# 3-way WNN UMAP coloured by wsnn cluster (res 0.6), predicted l2 cell type, cohort and subject
p1 <- DimPlot(
    ra_tea_nk,
    reduction = "wnn.3.umap", group.by = "wsnn_res.0.6", label = TRUE
) + NoLegend()
p2 <- DimPlot(
    ra_tea_nk,
    reduction = "wnn.3.umap", group.by = "predicted.celltype.l2",
    cols = cluster_colors, label = TRUE
)
p3 <- DimPlot(
    ra_tea_nk,
    reduction = "wnn.3.umap", group.by = "cohort",
    cols = nejm_color, label = TRUE
)
p4 <- DimPlot(ra_tea_nk, reduction = "wnn.3.umap", group.by = "subject_id")
p1 + p2 + p3 + p4
ggsave(
    file.path(fig_path, paste0(proj_name, "_wsnn_res.0.6_3wnnumap.pdf")),
    width = 12, height = 8
)

In [None]:
# cluster frequency and count plots at wsnn resolution 0.6, coloured by cohort
# plot_cluster_counts(all_so, 'wsnn_res.0.5', color.by='cohort', figname = '')
plot_cluster_freq(ra_tea_nk, "wsnn_res.0.6", color.by = "cohort", figname = "")
plot_cluster_counts(ra_tea_nk, "wsnn_res.0.6", color.by = "cohort", figname = "")

In [None]:
# write the NK cell Seurat object into data_path
saveRDS(ra_tea_nk, file.path(data_path, "PreRA_teaseq_seurat_NK_cells.rds"))

### labeling NK cell clusters
- label transfer from the ped vs senior object

In [None]:
# plot the canonical b cell protein markers
# NOTE(review): this cell sits under the NK labeling header but works on ra_tea_b with
# B cell markers - it looks like a copy of the B cell section. Confirm whether ra_tea_nk
# and an NK marker list were intended here.
DefaultAssay(ra_tea_b) <- 'cleanadt'
# list the feature names of the assay, sorted, for reference
rownames(ra_tea_b) %>% sort()
# protein (ADT) features to plot
b_adts <- c('CD19', 'CD20', 'CD38', 'CD24', 'CD27', 'CD10', 'CD319',
            'CD11c','IgD', 'IgE', 'IgM', 'Ig-light-chain-k', 'Ig-light-chain-l') %>% sort()
# features written with the 'rna_' prefix
b_genes <- c('rna_TBX21', 'rna_ITGAX', 'rna_PDCD1', 'rna_PRDM1', 'rna_ERO1B', 'rna_STAT3', 'rna_IKZF3')

In [None]:
# feature plots and density plots of the marker lists on the 3-way WNN UMAP
# NOTE(review): the plots are drawn from ra_tea_b, but the files are named from
# fig_path / proj_name, which this section last set to the NK values - if the cells are
# run in order, B cell figures land in the NK folder with an NK prefix. Confirm intent.
p1 <- FeaturePlot(ra_tea_b, features = c(b_adts, b_genes), raster = FALSE, 
                     min.cutoff = "q1", max.cutoff = "q99",
            reduction = 'wnn.3.umap')
p1
ggsave(file.path(fig_path, paste0(proj_name, '_adt_genes_3wnnumap.pdf')), width=24, height=18)
p2 <- plot_density(ra_tea_b, c(b_adts, b_genes), reduction = 'wnn.3.umap')
p2
ggsave(file.path(fig_path, paste0(proj_name, '_adts_genes_density_3wnnumap.pdf')), width=24, height=18)

In [None]:
# plot the boxplot of the NK cell clusters per subject, grouped by cohort
# uses ra_tea_nk at wsnn resolution 0.6 (the resolution used by the other NK cluster plots);
# the B cell object and resolution 0.4 were copy-paste leftovers
dittoSeq::dittoFreqPlot(ra_tea_nk, "wsnn_res.0.6",
     sample.by = "subject_id", group.by = "cohort", color.by = "cohort",
    split.adjust = list(scales = "free"))

## subset major cell types by 3wnn

In [None]:
# this section plots the whole PBMC object, so point proj_name / fig_path back at the
# project-level values instead of inheriting the ones set by the preceding subset section
proj_name <- 'PreRA_teaseq'
fig_path <- '/home/jupyter/figures/preRA_teaseq'
if (!dir.exists(fig_path)) (dir.create(fig_path, recursive = TRUE))
# plot 3wnn umap coloured by wsnn cluster (res 0.5), predicted l2 cell type, cohort and subject
p1 <- DimPlot(all_so, label = TRUE, reduction = 'wnn.3.umap',  group.by = 'wsnn_res.0.5') + NoLegend()
p2 <- DimPlot(all_so, group.by = 'predicted.celltype.l2', cols = cluster_colors,
              reduction = 'wnn.3.umap', label = TRUE)
p3 <- DimPlot(all_so, group.by = 'cohort', cols = nejm_color, reduction = 'wnn.3.umap', label = TRUE) + NoLegend()
p4 <- DimPlot(all_so, group.by = 'subject_id', reduction = 'wnn.3.umap')
p1 + p2 + p3 + p4
ggsave(file.path(fig_path, paste0(proj_name, '_l2_labels_wsnn_res.0.5_3wnnumap.pdf')), width=12, height=8)

In [None]:
# list the metadata columns, then show the object summary
colnames(all_so@meta.data)
all_so

In [None]:
# cohort-coloured UMAPs for each modality (RNA, protein, ATAC) and the 3-way WNN, in a 2-column grid
p1 <- scCustomize::DimPlot_scCustom(
    all_so,
    reduction = "umap", group.by = "cohort", colors_use = nejm_color,
    label = FALSE, repel = TRUE, raster = TRUE
) + NoLegend() + ggtitle("RNA") + NoAxes()
p2 <- scCustomize::DimPlot_scCustom(
    all_so,
    reduction = "adt_umap", group.by = "cohort", colors_use = nejm_color,
    label = FALSE, repel = TRUE, raster = TRUE
) + NoLegend() + ggtitle("Protein") + NoAxes()
p3 <- scCustomize::DimPlot_scCustom(
    all_so,
    reduction = "atac_umap", group.by = "cohort", colors_use = nejm_color,
    label = FALSE, repel = TRUE, raster = TRUE
) + ggtitle("ATAC") + NoLegend() + NoAxes()
p4 <- scCustomize::DimPlot_scCustom(
    all_so,
    reduction = "wnn.3.umap", group.by = "cohort", colors_use = nejm_color,
    label = FALSE, repel = TRUE, raster = TRUE
) + ggtitle("wnn.3.umap") + NoLegend() + NoAxes()
p1 + p2 + p3 + p4 + plot_layout(ncol = 2)
ggsave(
    file.path(fig_path, paste0(proj_name, "_cohort_3modalities_umaps.pdf")),
    width = 8, height = 6
)

In [None]:
# the 3-way WNN cohort UMAP again, this time keeping the legend
p4 <- scCustomize::DimPlot_scCustom(
    all_so,
    reduction = "wnn.3.umap", group.by = "cohort", colors_use = nejm_color,
    label = FALSE, repel = TRUE, raster = TRUE
) + ggtitle("wnn.3.umap") + NoAxes()
p4

In [None]:
# 3-way WNN UMAP labelled by l2_cell_types, legend removed
p1 <- scCustomize::DimPlot_scCustom(
    all_so,
    reduction = "wnn.3.umap", group.by = "l2_cell_types",
    colors_use = cluster_colors_ext,
    label = TRUE, repel = TRUE, raster = TRUE
) + NoLegend()
p1
ggsave(
    file.path(fig_path, paste0(proj_name, "_l2_cell_types_3wnnumap_unlabelled.pdf")),
    width = 5, height = 5
)

In [None]:
# 3-way WNN UMAP coloured by wsnn cluster (res 0.5), predicted l2 cell type, cohort and subject
p1 <- DimPlot(
    all_so,
    reduction = "wnn.3.umap", group.by = "wsnn_res.0.5", label = TRUE
) + NoLegend()
p2 <- DimPlot(
    all_so,
    reduction = "wnn.3.umap", group.by = "predicted.celltype.l2",
    cols = cluster_colors, label = TRUE
)
p3 <- DimPlot(
    all_so,
    reduction = "wnn.3.umap", group.by = "cohort",
    cols = nejm_color, label = TRUE
) + NoLegend()
p4 <- DimPlot(all_so, reduction = "wnn.3.umap", group.by = "subject_id")
p1 + p2 + p3 + p4
ggsave(
    file.path(fig_path, paste0(proj_name, "_l2_labels_wsnn_res.0.5_3wnnumap.pdf")),
    width = 12, height = 8
)

## subset only the myeloid cells from intersect atac and rna labels

In [None]:
# load atac labels for the monocytes
# dplyr::rename is namespace-qualified: Bioconductor packages attached together with ArchR
# (e.g. S4Vectors) may export their own rename() and mask the dplyr verb
atac_labels <- read_csv(file.path(meta_path,
                             'RA_TEAseq_Monocyte_SubtypeLabeledMetadata.csv'),
                   col_types = cols()) %>% dplyr::rename('sample_id_barcode' = '...1')
# size and column names of the label table
atac_labels %>% dim()
atac_labels %>% colnames()
# TRUE when every labelled barcode is present in the Seurat metadata
all(atac_labels$barcodes %in% all_so@meta.data$barcodes)

In [None]:
# flag, per ATAC-labelled barcode, whether it is present in the Seurat metadata
match_barcodes <- is.element(atac_labels$barcodes, all_so@meta.data$barcodes)
# number of cells in the atac labels that are missing from the seurat data
sum(!match_barcodes)

In [None]:
# cells per predicted l2 cell type: all cells first, then only those with an ATAC label
tally(group_by(all_so@meta.data, predicted.celltype.l2))
tally(group_by(
    filter(all_so@meta.data, barcodes %in% atac_labels$barcodes),
    predicted.celltype.l2
))

In [None]:
# select columns to keep in atac labels and give the ATAC-specific ones an 'atac_' prefix
# dplyr:: is spelled out because select()/rename() may be masked by Bioconductor packages
# attached together with ArchR
atac_labels %>% colnames()
atac_labels.keep <- atac_labels %>%
    dplyr::select(c(barcodes, nFrags, Clusters, umap_1, umap_2,
                    predicted.MonocyteSubsets, predicted.MonocyteSubsets.score)) %>%
    dplyr::rename('atac_clusters' = 'Clusters', 'atac_umap1' = 'umap_1', 'atac_umap2' = 'umap_2')
atac_labels.keep %>% head()

In [None]:
# the distinct predicted l2 cell types present in the object
distinct(all_so@meta.data, predicted.celltype.l2)

In [None]:
# keep metadata rows that either carry an ATAC label or have a myeloid predicted l2 type,
# then attach the selected ATAC columns by barcode
myeloid_labels <- left_join(
    dplyr::filter(
        all_so@meta.data,
        (barcodes %in% atac_labels$barcodes) |
            (predicted.celltype.l2 %in% c("CD14 Mono", "CD16 Mono", "pDC", "cDC2", "cDC1", "HSPC", "ASDC"))
    ),
    atac_labels.keep,
    by = "barcodes"
)
tally(group_by(myeloid_labels, predicted.celltype.l2))
tally(group_by(myeloid_labels, predicted.MonocyteSubsets))
nrow(myeloid_labels)

In [None]:
# TRUE when every myeloid barcode is present in the Seurat metadata
all(is.element(myeloid_labels$barcodes, all_so@meta.data$barcodes))

In [None]:
# the distinct pbmc_sample_id values among the myeloid cells
distinct(myeloid_labels, pbmc_sample_id)

In [None]:
# keep only the cells whose barcode appears in myeloid_labels
all_so_mye <- subset(x = all_so, subset = barcodes %in% myeloid_labels$barcodes)

In [None]:
# add additional metadata from atac to the object
cell_id <- all_so_mye@meta.data %>% rownames()
# align the label rows to the object's cells by barcode instead of relying on row position,
# so labels cannot end up on the wrong cell if the join changed the row count or order
row_idx <- match(all_so_mye@meta.data$barcodes, myeloid_labels$barcodes)
# fail loudly if a cell has no matching row
stopifnot(!anyNA(row_idx))
myeloid_meta <- as.data.frame(myeloid_labels[row_idx, , drop = FALSE])
# restore the cell names as row names before the table goes back into the object
rownames(myeloid_meta) <- cell_id
all_so_mye@meta.data <- myeloid_meta

In [None]:
# the distinct well_id values in the myeloid subset
distinct(all_so_mye@meta.data, well_id)

In [None]:
# display the myeloid subset object
all_so_mye

In [None]:
# save the seurat object
# fixed file name, like the other subset objects saved to data_path: proj_name is reassigned
# by the subset sections, so building the name from it would depend on which cell ran last
all_so_mye %>% saveRDS(file.path(data_path, 'PreRA_teaseq_seurat_myeloid_cells.rds'))