In [26]:
source("~/sc-online/extraGenes.R")
source("~/sc-online/myExpSetCreatorFn.R")
source("~/sc-online/extraGenes.R")
source("~/sc-online/myExpSetCreatorFn.R")
source("~/sc-online/utils.R")
source("~/sc-online/plot.R")
source("~/sc-online/clust.R")
source("~/sc-online/labelTransfer.R")
source("~/sc-online/getData.R")

library(caret)
library("DropletUtils")
library(dplyr)
library(ggplot2)
library(grid)
library(gridExtra)
library(gtable)
library(harmony)
library(lisi)
library(Matrix)
library(patchwork)
library(pheatmap)
library(qs)
library(RColorBrewer)
library(SingleCellExperiment)
library(rhdf5)
library(rlang)

library(Seurat)
library(tidyr)
library(xml2)

library(viridis)
library(viridisLite)

qs 0.25.7



In [2]:
# Root directory used with file.path() throughout the notebook.
BASE_PATH <- "/mnt/accessory/seq_data"

# Relative paths of input files; they are not joined to BASE_PATH in this cell.
VIREO_DONOR_IDS_BASENAME <- "vireo_outs/donor_list/donor_ids.tsv"
CB_SCE_BASENAME <- "ingested_data/cb_data_sce_FPR_0.01.rds"
SUMMARY_BASENAME <- "vireo_outs/donor_list/summary.tsv"

# Set relatively conservative thresholds for initial filtering
PCT_INTRONIC_MIN <- 20
PCT_MT_MAX <- 10


In [25]:
# Drop the first "_"-delimited token of a long library name.
# A name without any "_" is returned unchanged (the previous `split[2:length(split)]`
# indexing produced "NA_<name>" in that case).
.stripLibPrefix = function(long_name) {
    tokens = strsplit(long_name, split = "_")[[1]]
    if (length(tokens) < 2) {
        return(long_name)
    }
    return(paste(tokens[-1], collapse = "_"))
}

# Read one library list file (one long name per line, blank lines ignored).
# Returns list(long = <character vector named by short name>, short = <list of short names>).
.readLibNames = function(path) {
    libs_long = readLines(path)
    libs_long = libs_long[libs_long != ""]
    libs_short = lapply(libs_long, .stripLibPrefix)
    names(libs_long) = libs_short
    return(list(long = libs_long, short = libs_short))
}

calico_lib_names = .readLibNames("~/calico-libs-long.txt")
calico_libs_long = calico_lib_names$long
calico_libs = calico_lib_names$short

gtex_lib_names = .readLibNames("~/gtex-libs-long.txt")
gtex_libs_long = gtex_lib_names$long
gtex_libs = gtex_lib_names$short

# Combined views over both cohorts, calico first.
libs_long = c(calico_libs_long, gtex_libs_long)
libs = c(calico_libs, gtex_libs)

In [None]:
# can change this later
log10_nUMI_thresholds = rep(3.0, length(calico_libs))
names(log10_nUMI_thresholds) = calico_libs

# Sort label ('nurr' or 'dapi') of each calico library, in the order of calico_libs.
dapi_nurr = list(
    'nurr', 'dapi', 'nurr', 'dapi',
    'nurr', 'nurr', 'dapi',
    'nurr', 'nurr', 'dapi',
    'nurr', 'nurr', 'dapi',
    'nurr', 'nurr', 'dapi',
    'nurr', 'nurr', 'dapi',
    'nurr', 'nurr', 'dapi',
    'nurr', 'nurr',
    'dapi', 'dapi', 'nurr',
    'nurr', 'nurr', 'dapi',
    'dapi', 'dapi', 'dapi',
    'nurr', 'nurr', 'dapi')
# Fail loudly instead of silently padding names with NA when the lists drift apart.
stopifnot(length(dapi_nurr) == length(calico_libs))
names(dapi_nurr) = calico_libs

# Short display name of each calico library, in the order of calico_libs.
short_lib_names = list(
    '19B', '19BD', '19C', '19CD',
    '19D1', '19D2', '19DD',
    '19E1', '19E2', '19ED',
    '19F1', '19F2', '19FD',
    '19G1', '19G2', '19GD',
    '19I1', '19I2', '19ID',
    '19J1', '19J2', '19JD',
    '19K1', '19K2',
    '21A8', '21A9',
    '21B8', '21B9',
    '21C8', '21C9',
    '21D8', '21D9',
    '21E8', '21F8', '21G8', '21H8'
)
# This list has one entry per calico library (same length as dapi_nurr), so it is named
# with calico_libs; naming it with `libs` (calico + gtex) fails as soon as any gtex
# library is present because the names vector is longer than the list.
stopifnot(length(short_lib_names) == length(calico_libs))
names(short_lib_names) = calico_libs



In [None]:
# Build the per-library object list via loadCbSceList using the QC thresholds from the
# configuration cell.
# Only the calico libraries are requested: log10_nUMI_thresholds is built from (and named
# by) calico_libs, so passing `libs` (calico + gtex) would request libraries that have no
# threshold entry.
calico_sce_list = loadCbSceList(calico_libs,
    pct_mt_max=PCT_MT_MAX,
    pct_intronic_min= PCT_INTRONIC_MIN,
    log10_nUMI_threshold_list=log10_nUMI_thresholds
)

In [None]:
# Histogram (100 breaks) of the pct_intronic column of colData for the first element of calico_sce_list.
hist(colData(calico_sce_list[[1]])$pct_intronic, breaks = 100)

In [None]:
# Bind all per-library objects into one, split it on the `sort` column of colData,
# then split each half again on "donor_id".
calico_all <- .mycBindFn(calico_sce_list)
calico_nurr <- calico_all[, colData(calico_all)$sort == "nurr"]
calico_dapi <- calico_all[, colData(calico_all)$sort == "dapi"]

cb_sce_nurr_donor_list <- .mySplitObject(calico_nurr, "donor_id")
cb_sce_dapi_donor_list <- .mySplitObject(calico_dapi, "donor_id")

In [None]:
# Cache both per-donor lists under BASE_PATH in qs format (date-stamped file names).
qsave(cb_sce_nurr_donor_list, file.path(BASE_PATH, "cb_sce_donor_list_nurr_20240304.qs"))
qsave(cb_sce_dapi_donor_list, file.path(BASE_PATH, "cb_sce_donor_list_dapi_20240304.qs"))


In [None]:
# NOTE(review): `seurat.object` is not assigned anywhere in the visible notebook, so this
# cell looks like leftover scratch and would fail on a fresh kernel — confirm and remove.
DefaultAssay(object = seurat.object)

In [None]:
# Number of column names (barcodes) that occur more than once in the nurr donor split.
# colnames() of a plain list is NULL, which made the original length-minus-unique
# arithmetic always evaluate to 0; when the object itself has no column names, gather the
# column names of each of its elements instead.
nurr_barcodes = colnames(cb_sce_nurr_donor_list)
if (is.null(nurr_barcodes)) {
    nurr_barcodes = unlist(lapply(cb_sce_nurr_donor_list, colnames), use.names = FALSE)
}
length(nurr_barcodes) - length(unique(nurr_barcodes))

In [None]:
# Re-source getData.R (already sourced in the first cell) so that edits to it are picked up
# without restarting the kernel, then run rawSceToHarmonizedSeurat on the nurr donor list.
# NOTE(review): rawSceToHarmonizedSeurat is not defined in this notebook; presumably it comes
# from one of the sourced scripts — confirm.
source("~/sc-online/getData.R")
seurat_nurr_merged = rawSceToHarmonizedSeurat(cb_sce_nurr_donor_list)

In [None]:
# Set the notebook plot size to 13 x 13, then draw the labelled DimPlot of the nurr object.
options(repr.plot.width=13, repr.plot.height=13)
DimPlot(seurat_nurr_merged, label=TRUE)

In [None]:
# Same object, grouped by short_donor_id.
DimPlot(seurat_nurr_merged, group.by='short_donor_id')

In [None]:
# Expression of four genes of interest (presumably dopaminergic-neuron markers — confirm).
FeaturePlot(seurat_nurr_merged, c("SLC6A3", "TH", "DDC", "NR4A2"))

In [None]:
# For each cluster id 0..(n_clusters - 1), count the distinct short_donor_id values among
# its cells. Element k of `ndonors` belongs to cluster k - 1.
# The counts are read straight from the two metadata vectors instead of subsetting the
# whole Seurat object once per cluster, and seq_len() yields an empty loop (rather than
# iterating 0, -1) when there are no clusters.
nurr_cluster_ids = seurat_nurr_merged$seurat_clusters
nurr_donor_ids = seurat_nurr_merged$short_donor_id
ndonors = lapply(seq_len(length(unique(nurr_cluster_ids))) - 1, function(cluster_id) {
    length(unique(nurr_donor_ids[nurr_cluster_ids == paste0(cluster_id)]))
})

In [None]:
# Name each donor count by its zero-based cluster id, keep the clusters whose count
# exceeds one, and display their ids.
names(ndonors) <- as.character(0:(length(ndonors) - 1))
ndonors_gt1 <- ndonors[ndonors > 1]
ndonors_gt1_clusters <- names(ndonors_gt1)
ndonors_gt1_clusters

In [None]:
# remove donor_specific clusters
# (keeps only cells whose cluster id is listed in ndonors_gt1_clusters)
seurat_nurr_merged_multidonor_clusters = seurat_nurr_merged[, seurat_nurr_merged$seurat_clusters %in% ndonors_gt1_clusters]

In [None]:
# Dimensions before and after dropping the single-donor clusters.
dim(seurat_nurr_merged)
dim(seurat_nurr_merged_multidonor_clusters)

In [None]:
# Same rawSceToHarmonizedSeurat step for the dapi donor list.
seurat_dapi_merged = rawSceToHarmonizedSeurat(cb_sce_dapi_donor_list)

In [None]:
DimPlot(seurat_dapi_merged, label=TRUE)

In [None]:
# NOTE(review): this replaces the objects built in the cells above with versions read
# from date-stamped 20240119 files; when running top to bottom everything below uses the
# cached objects, not the freshly computed ones — confirm that is intended.
seurat_nurr_merged = readRDS(file.path(BASE_PATH, "seurat_nurr_merged_initial_harmonized_20240119.rds"))
seurat_dapi_merged = readRDS(file.path(BASE_PATH, "seurat_dapi_merged_initial_harmonized_20240119.rds"))


In [None]:
# (10) Find markers
# plan() is provided by the `future` package, which no library() call in this notebook
# attaches; namespace-qualify it so the cell does not depend on a sourced script having
# loaded it.
future::plan(strategy = "multicore", workers = 28)

# Find all markers; adjust parameters as needed
nurr_markers = FindAllMarkers(
    seurat_nurr_merged,
    only.pos=TRUE)

# Cache the marker table next to the other outputs under BASE_PATH.
saveRDS(nurr_markers, file=file.path(BASE_PATH, "markers_nurr_initial_harmonized_20240130.rds"))


In [None]:
# Positive-only markers for every cluster of the dapi object, cached under BASE_PATH.
dapi_markers <- FindAllMarkers(seurat_dapi_merged, only.pos = TRUE)

saveRDS(
    dapi_markers,
    file = file.path(BASE_PATH, "markers_dapi_initial_harmonized_20240119.rds")
)

In [None]:
# Reload the cached marker tables.
# The nurr table is read from the same file name the nurr FindAllMarkers cell writes
# (..._20240130.rds); the ..._20240119.rds name used here before is not written anywhere
# in this notebook, so it silently replaced freshly computed markers with an older table.
nurr_markers = readRDS(file.path(BASE_PATH, "markers_nurr_initial_harmonized_20240130.rds"))
dapi_markers = readRDS(file.path(BASE_PATH, "markers_dapi_initial_harmonized_20240119.rds"))

In [None]:
# Bare-word aliases for the cell-class label strings, so the per-cluster class lists
# further down can be written without quotes.
astro     <- "astro"
endo      <- "endo"
mg        <- "mg"
da        <- "da"
nonda     <- "nonda"
oligo     <- "oligo"
opc       <- "opc"
mix       <- "mix"
mito      <- "mito"
none      <- "none"
immune    <- "immune"
ependymal <- "ependymal"

In [None]:
# Print the marker table of every cluster, highest pct.1 first.
# as.numeric() on the cluster column gives 1-based codes, so the cluster labels run from
# 0 to max - 1. The loop now starts at 0: the previous `1:(max - 1)` range skipped
# cluster 0 (the later printMarkersByCluster loop already starts at 0).
n_marker_clusters = max(as.numeric(nurr_markers$cluster))
for (cluster_id in seq_len(n_marker_clusters) - 1){
    print(cluster_id)
    # Named `cluster_markers` rather than `c` to avoid shadowing base::c.
    cluster_markers = nurr_markers[nurr_markers$cluster == cluster_id, ]
    # order by pct.1 descending
    cluster_markers = cluster_markers[order(cluster_markers$pct.1, decreasing = TRUE),]
    print(cluster_markers[, c("gene", "avg_log2FC", "pct.1", "pct.2")])
    print("::::::::::::::::::::::")
}

In [None]:
# NOTE(review): setwh is not defined in this notebook; presumably a plot-size helper from
# one of the sourced scripts — confirm.
setwh(8, 8)
DimPlot(seurat_nurr_merged, label=TRUE)

In [None]:
# Cluster labels run from 0 to (max 1-based code - 1); print each cluster's markers via
# printMarkersByCluster.
for (i in 0:max(as.numeric(nurr_markers$cluster)-1)){
    printMarkersByCluster(nurr_markers, cluster=i)
}


In [None]:

# Rank genes by how much more of their summed signal falls inside one cluster than outside it.
#
# Args:
#   seurat_obj: object supporting `$seurat_clusters`, column subsetting with a logical
#               vector, and rowSums(). Which values rowSums() adds up depends on the
#               rowSums method for the object's class — TODO confirm for Seurat objects.
#   cluster:    cluster id compared against seurat_obj$seurat_clusters with `==`.
#   digits:     decimal places to round to (default 3, as before); NULL disables rounding.
#
# Returns:
#   Named numeric vector of log2((in-cluster row sum + 1) / (out-of-cluster row sum + 1)),
#   sorted in decreasing order. Sums are not normalized by the number of cells.
getClusterLogFC = function(seurat_obj, cluster, digits = 3){
    # Compute the membership mask once and reuse it for both halves.
    in_cluster = seurat_obj$seurat_clusters == cluster
    clust_genesums = rowSums(seurat_obj[, in_cluster])
    non_clust_genesums = rowSums(seurat_obj[, !in_cluster])
    # +1 pseudocount keeps the ratio finite when either sum is zero.
    logfc_clust = log2((clust_genesums + 1) / (non_clust_genesums + 1))
    if (!is.null(digits)) {
        logfc_clust = round(logfc_clust, digits)
    }
    return(sort(logfc_clust, decreasing = TRUE))
}

# Top 20 genes for cluster 24.
getClusterLogFC(seurat_nurr_merged, 24)[1:20]

In [None]:
ndonors_gt1_clusters

In [None]:
# Top 15 genes for each cluster id in the hard-coded range 30..63, skipping ids that are
# not in ndonors_gt1_clusters.
for (i in 30:63){
    if (! paste0(i) %in% ndonors_gt1_clusters){next}
    print(i)
    print(getClusterLogFC(seurat_nurr_merged, i)[1:15])
}

In [None]:

# Cell class per cluster; element k is cluster k - 1 (trailing comments give cluster ids).
nurr_classes = list(
    "immune", "da", 'nonda', 'nonda', 'oligo' #0-4
    , "nonda", 'nonda', 'nonda', 'da', 'nonda' #5-9
    , 'nonda', 'nonda', 'astro', 'nonda', 'da' #10-14
    , 'nonda', 'nonda', 'da', 'endo', 'none' #15-19
    , 'endo', 'da', 'nonda', 'mix', 'none' #20-24
    , 'nonda', 'endo', 'nonda', 'opc', 'nonda' #25-29
    , 'immune', 'none', 'nonda' #30-32
)
# Pad the remaining clusters (up to 64 entries, i.e. cluster 63) with 'none'.
# Padding starts after the last explicit entry: the list above already has 33 elements,
# so the previous `nurr_classes[33:64]` overwrote cluster 32 ('nonda') with 'none'.
nurr_classes[(length(nurr_classes) + 1):64] = 'none'



In [None]:
nurr_classes

In [None]:
# Apply the per-cluster classes via assignCellClasses (not defined in this notebook;
# presumably from a sourced script), then plot only cells whose cell_class is neither
# "none" nor "mix".
seurat_nurr_merged = assignCellClasses(seurat_nurr_merged, nurr_classes)
#DimPlot(seurat_nurr_merged, group.by="cell_class")
DimPlot(seurat_nurr_merged[, ! seurat_nurr_merged$cell_class %in% c("none", "mix")], group.by="cell_class")

In [None]:
# NOTE(review): the inspection cells below use seurat_nurr_merged_clean, md and new_md,
# which are first assigned in the cleaning cell that follows them; on a fresh kernel run
# top to bottom these cells fail. They look like debugging leftovers — move below the
# cleaning cell or remove.
dim(seurat_nurr_merged_clean)

In [None]:
# Number of repeated donor_id_barcode values in the metadata.
length(seurat_nurr_merged@meta.data$donor_id_barcode) - length(unique(seurat_nurr_merged@meta.data$donor_id_barcode))

In [None]:
# Metadata rows whose row name differs from their donor_id_barcode.
seurat_nurr_merged@meta.data[rownames(seurat_nurr_merged@meta.data) != seurat_nurr_merged@meta.data$donor_id_barcode, ]

In [None]:
head(seurat_nurr_merged_clean@meta.data["206954930010_R02C01_AAACGCTCAATCTGCA-1", ])

In [None]:
head(md)

In [None]:
head(new_md)

In [None]:
head(seurat_nurr_merged_clean@meta.data)

In [None]:
sum(is.na(seurat_nurr_merged_clean$donor_id))

In [None]:
# we are now going to clean the seurat_nurr_merged in two ways
# 1. remove clusters without a cell class assignment
# 2. apply metadata; remove donors without a case-control status

seurat_nurr_merged_clean = seurat_nurr_merged[, seurat_nurr_merged$cell_class %in% c("astro", "da", "endo", "immune", "nonda", "oligo", "opc")]

# do not know when or how duplicate donor_id_barcodes are being introduced
seurat_nurr_merged_clean = seurat_nurr_merged_clean[, !duplicated(seurat_nurr_merged_clean$donor_id_barcode)]
seurat_nurr_merged_clean = seurat_nurr_merged_clean[ ,!is.na(seurat_nurr_merged_clean$cell_class)]

# Manifest rows (chip_well_barcode values) dropped before the join.
rows_to_exclude_TMP = list(
    "206954930010_R11C01",
    "206954930011_R11C01"
)

# Donor-level manifest, reduced to the join key and the three covariates that are attached.
md = read.table(file.path(BASE_PATH, "calico", "Calico_Macosko_PD_GSA_Terra_Manifest.tsv"), header = TRUE, sep = "\t")
md = md[!md$chip_well_barcode %in% rows_to_exclude_TMP, c("chip_well_barcode", "Disease", "Age", "Sex")]
colnames(md) = c("donor_id", "status", "age", "sex")

md_clean = md[complete.cases(md),]

# merge() does not keep the row order of `x` (by default it sorts on the `by` column),
# while the rows of @meta.data must stay in the same order as the cells of the object.
# Remember each cell's name and position so the original order can be restored afterwards.
cell_md = seurat_nurr_merged_clean@meta.data
cell_md$.cell_name = rownames(cell_md)
cell_md$.cell_order = seq_len(nrow(cell_md))

new_md = merge(
        x=cell_md,
        y=md_clean,
        by="donor_id",
        all.x = TRUE
    )
# A donor_id listed more than once in the manifest would duplicate cells in the join.
stopifnot(nrow(new_md) == nrow(cell_md))
new_md = new_md[order(new_md$.cell_order), ]
rownames(new_md) = new_md$.cell_name
new_md$.cell_name = NULL
new_md$.cell_order = NULL
seurat_nurr_merged_clean@meta.data = new_md

# Drop cells whose donor has no status, then derive the binary case/control label.
seurat_nurr_merged_clean = seurat_nurr_merged_clean[, ! is.na(seurat_nurr_merged_clean$status)]
seurat_nurr_merged_clean$case_control = 'ctr'
seurat_nurr_merged_clean$case_control[seurat_nurr_merged_clean$status != 'Control'] = 'pd'

setwh(12,12)
DimPlot(seurat_nurr_merged_clean, group.by="cell_class")
DimPlot(seurat_nurr_merged_clean, label=TRUE)
DimPlot(seurat_nurr_merged_clean, group.by="case_control")

In [None]:
# Label every cell 'ctr', then relabel cells whose status is not 'Control' as 'pd'.
# NOTE(review): these two statements repeat the ones at the end of the cleaning cell.
seurat_nurr_merged_clean$case_control = 'ctr'
seurat_nurr_merged_clean$case_control[seurat_nurr_merged_clean$status != 'Control'] = 'pd'


In [None]:
setwh(14, 14)
# DimPlot(seurat_nurr_merged_clean, label=T)
# DimPlot(seurat_nurr_merged_clean, group.by="cell_class")
DimPlot(seurat_nurr_merged_clean, group.by="case_control")

In [None]:
library(qs)

In [None]:
# Persist the cleaned nurr object in qs format.
# NOTE(review): the path is spelled out in full here, whereas other cells build paths from BASE_PATH.
qsave(seurat_nurr_merged_clean, "/mnt/accessory/seq_data/calico/seurat_nurr_merged_initial_harmonized_20240130.qs")

: 

In [None]:
# Temporarily widen the plot area (16 x 8) for the labelled DimPlot, then reset to 8 x 8.
options(repr.plot.width=16, repr.plot.height=8)
DimPlot(seurat_nurr_merged, label=T)
options(repr.plot.width=8, repr.plot.height=8)

In [None]:
# NOTE(review): this list has 45 entries and matches the commented-out "20240119 markers"
# list at the end of the notebook, whereas the nurr_classes list defined earlier has 64
# entries. Running this cell overwrites both nurr_classes and the cell_class assignment on
# seurat_nurr_merged — confirm which mapping is current.
nurr_classes = list(
    none, da, nonda, da, nonda, 
    nonda, none, nonda, nonda, nonda, 
    da, nonda, nonda, astro, nonda, #10
    nonda, nonda, mito, none, none, 
    nonda, nonda, da, opc, none,    #20
    mito, none, nonda, nonda, nonda, 
    astro, nonda, none, none, none, #30
    none, ependymal, nonda, none, mix, 
    mix, none, none, nonda, none 
    )

seurat_nurr_merged = assignCellClasses(seurat_nurr_merged, nurr_classes)
# Plot only cells whose class is neither none nor mix.
DimPlot(seurat_nurr_merged[, ! seurat_nurr_merged$cell_class %in% c(none, mix)], group.by="cell_class")

In [None]:

# One FeaturePlot per cell on the nurr object: three QC-style metadata features
# (QC_MT.pct, log10_nUMI, pct_intronic) and six individual genes.
FeaturePlot(seurat_nurr_merged, features = "QC_MT.pct")


In [None]:
FeaturePlot(seurat_nurr_merged, features = "log10_nUMI")

In [None]:
FeaturePlot(seurat_nurr_merged, features = "SLC17A6")

In [None]:
FeaturePlot(seurat_nurr_merged, features = "GAD1")

In [None]:
FeaturePlot(seurat_nurr_merged, features = "SLC6A3")

In [None]:
FeaturePlot(seurat_nurr_merged, features = "GPNMB")

In [None]:
FeaturePlot(seurat_nurr_merged, features = "PLAT")

In [None]:
FeaturePlot(seurat_nurr_merged, features = "CX3CR1")

In [None]:
FeaturePlot(seurat_nurr_merged, features = "pct_intronic")

In [None]:
# Print the markers of every dapi cluster (labels 0 .. max 1-based code - 1).
for (i in 0:max(as.numeric(dapi_markers$cluster)-1)){
    printMarkersByCluster(dapi_markers, cluster=i)
}

In [None]:
# NOTE(review): cell_class is assigned to seurat_dapi_merged by assignCellClasses in a
# later cell; here it is only available if the object read from disk already carries it.
DimPlot(seurat_dapi_merged[, ! seurat_dapi_merged$cell_class %in% c(mito, mix, none)], label=T)

In [None]:
# Fraction of cells in each object whose class is not mito, mix, none or NA.
sum(! seurat_dapi_merged$cell_class %in% c(mito, mix, none, NA)) / ncol(seurat_dapi_merged)
sum(! seurat_nurr_merged$cell_class %in% c(mito, mix, none, NA)) / ncol(seurat_nurr_merged)

In [None]:

# Cell class per dapi cluster (29 entries); the trailing comment on each row gives the
# cluster id of that row's first entry.
dapi_classes = list(
    oligo, mg, oligo, oligo, astro, #0
    oligo, opc, nonda, astro, oligo,#5
    da, endo, endo, immune, nonda,#10
    astro, nonda, none, mito, none,#15
    mg, mg, nonda, opc, oligo,#20
    nonda, immune, opc, none#25   
    )

seurat_dapi_merged = assignCellClasses(seurat_dapi_merged, dapi_classes)
# Plot only cells whose class is not mix, none or NA.
DimPlot(seurat_dapi_merged[, ! seurat_dapi_merged$cell_class %in% c(mix, none, NA)], group.by="cell_class")

In [None]:
# Writes the annotated object back to the same file name it was read from earlier.
saveRDS(seurat_dapi_merged, file=file.path(BASE_PATH, "seurat_dapi_merged_initial_harmonized_20240119.rds"))

In [None]:
# Per-row fraction of cluster-1 cells with a value above zero: row sums of the `> 0`
# indicator divided by the number of cluster-1 cells.
# NOTE(review): `> 0` is applied to the subsetted object itself; confirm that this
# comparison is defined for the object's class, otherwise extract the matrix first.
pct_clust1 = (
    rowSums(seurat_dapi_merged[, seurat_dapi_merged$seurat_clusters==1] > 0) /
    ncol(seurat_dapi_merged[, seurat_dapi_merged$seurat_clusters==1])
)

In [None]:

# Rank genes by how much more of their summed signal falls inside one cluster than outside it.
#
# Args:
#   seurat_obj: object supporting `$seurat_clusters`, column subsetting with a logical
#               vector, and rowSums(). Which values rowSums() adds up depends on the
#               rowSums method for the object's class — TODO confirm for Seurat objects.
#   cluster:    cluster id compared against seurat_obj$seurat_clusters with `==`.
#   digits:     optional number of decimal places to round to; the default NULL keeps the
#               unrounded values this definition has always returned.
#
# Returns:
#   Named numeric vector of log2((in-cluster row sum + 1) / (out-of-cluster row sum + 1)),
#   sorted in decreasing order. Sums are not normalized by the number of cells.
#
# NOTE(review): a function with this name is also defined earlier in the notebook with
# rounding to 3 decimals; this definition replaces it once the cell has run.
getClusterLogFC = function(seurat_obj, cluster, digits = NULL){
    # Compute the membership mask once and reuse it for both halves.
    in_cluster = seurat_obj$seurat_clusters == cluster
    clust_genesums = rowSums(seurat_obj[, in_cluster])
    non_clust_genesums = rowSums(seurat_obj[, !in_cluster])
    # +1 pseudocount keeps the ratio finite when either sum is zero.
    logfc_clust = log2((clust_genesums + 1) / (non_clust_genesums + 1))
    if (!is.null(digits)) {
        logfc_clust = round(logfc_clust, digits)
    }
    return(sort(logfc_clust, decreasing = TRUE))
}

# Top 50 genes for dapi cluster 1.
getClusterLogFC(seurat_dapi_merged, 1)[1:50]



In [None]:
# Top 100 genes for clusters 17, 19 and 28 (the entries set to `none` in dapi_classes).
getClusterLogFC(seurat_dapi_merged, 17)[1:100]
getClusterLogFC(seurat_dapi_merged, 19)[1:100]
getClusterLogFC(seurat_dapi_merged, 28)[1:100]

In [None]:
# what are the genes that have the highest enrichment in cluster 1?
# NOTE(review): clust1_genesums and non_clust1_genesums are not assigned anywhere in the
# visible notebook; this cell fails on a fresh kernel — confirm and remove or define them.
log(clust1_genesums) - log(1 + non_clust1_genesums)

In [None]:
# Overview of the dapi object: classes first, then three QC-style metadata features on all
# cells, then nine single-gene plots restricted to cells whose class is not mix, none or NA.
# The same subset expression is re-evaluated in each of the nine gene cells.
DimPlot(seurat_dapi_merged, group.by="cell_class")

In [None]:
FeaturePlot(seurat_dapi_merged,
    features="log10_nUMI")

In [None]:
FeaturePlot(seurat_dapi_merged,
    features="QC_MT.pct")

In [None]:
FeaturePlot(seurat_dapi_merged,
    features="pct_intronic")

In [None]:
FeaturePlot(seurat_dapi_merged[, ! seurat_dapi_merged$cell_class %in% (c(mix, none, NA))],
    features="SLC6A3")

In [None]:
FeaturePlot(seurat_dapi_merged[, ! seurat_dapi_merged$cell_class %in% (c(mix, none, NA))],
    features="SYT1")

In [None]:
FeaturePlot(seurat_dapi_merged[, ! seurat_dapi_merged$cell_class %in% (c(mix, none, NA))],
    features="GAD1")

In [None]:
FeaturePlot(seurat_dapi_merged[, ! seurat_dapi_merged$cell_class %in% (c(mix, none, NA))],
    features="GPNMB")

In [None]:
FeaturePlot(seurat_dapi_merged[, ! seurat_dapi_merged$cell_class %in% (c(mix, none, NA))],
    features="GFAP")

In [None]:
FeaturePlot(seurat_dapi_merged[, ! seurat_dapi_merged$cell_class %in% (c(mix, none, NA))],
    features="OLIG1")

In [None]:
FeaturePlot(seurat_dapi_merged[, ! seurat_dapi_merged$cell_class %in% (c(mix, none, NA))],
    features="OLIG2")

In [None]:
FeaturePlot(seurat_dapi_merged[, ! seurat_dapi_merged$cell_class %in% (c(mix, none, NA))],
    features="THEMIS")

In [None]:
FeaturePlot(seurat_dapi_merged[, ! seurat_dapi_merged$cell_class %in% (c(mix, none, NA))],
    features="SPP1")

In [None]:
# 20240119 markers
# nurr_classes = list(
#     none, da, nonda, da, nonda, 
#     nonda, none, nonda, nonda, nonda, 
#     da, nonda, nonda, astro, nonda, #10
#     nonda, nonda, mito, none, none, 
#     nonda, nonda, da, opc, none,    #20
#     mito, none, nonda, nonda, nonda, 
#     astro, nonda, none, none, none, #30
#     none, ependymal, nonda, none, mix, 
#     mix, none, none, nonda, none 
#     )