# Preprocessing
Data published in [Shiraishi et al, 2024, Cancer-specific epigenome identifies oncogenic hijacking by nuclear factor I family proteins for medulloblastoma progression](https://pubmed.ncbi.nlm.nih.gov/38834071/).  
Available in [GSE243609](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE243609).  

This notebook performs standard preprocessing, including QC on mitochondrial read fraction, read depth, and read complexity.  
It outputs a list of 3 Seurat objects [gnp, pnc, tumor] to `out/seuratobject_list.rds`.  
Names prefixed with `m_` refer to results computed after regressing out `percent.mt`.

In [None]:
# Language used for R's own messages
Sys.setenv(LANGUAGE = "en") # set language to "ja" if you prefer

# Packages attached for this notebook
library(Seurat)
library(tidyverse)
library(harmony)
library(ggplot2)
library(future)
library(mclust)
library(patchwork)
library(ggalluvial)
library(ggrepel)
library(RColorBrewer)
library(slingshot)
library(igraph)
library(ggbeeswarm)

# Fix the random seed of the R session
set.seed(47);

# Data processing and clustering

In [None]:
# Set parallel execution settings
# Use half of the available cores, but never fewer than one worker: on a
# single-core machine availableCores()/2 truncates to 0, which plan() rejects.
future::plan("multisession", workers = max(1L, as.integer(availableCores()/2)), gc=TRUE)
options(future.globals.maxSize = 1024*8*1024^2) # Set max variable size to 8Gb

In [None]:
# Input file locations
# Root folder of the GEO download; change this if you put your data
# somewhere other than ./data/GSE243609/
DATA_DIR <- file.path("data", "GSE243609")

# One entry per sample: element 1 is the filtered feature-barcode matrix (.h5),
# element 2 is the fragments file (.tsv.gz).
DATA_LOCATIONS <- list(
    gnp = file.path(DATA_DIR, c(
        "GSM7791840_032103_filtered_feature_bc_matrix.h5",
        "GSM7791839_032103_fragments.tsv.gz"
    )),
    pnc = file.path(DATA_DIR, c(
        "GSM7791842_RS03056_filtered_feature_bc_matrix.h5",
        "GSM7791841_RS03056_fragments.tsv.gz"
    )),
    tumor = file.path(DATA_DIR, c(
        "GSM7791844_RS03060_filtered_feature_bc_matrix.h5",
        "GSM7791843_RS03060_fragments.tsv.gz"
    ))
)

# Build a Seurat object from the RNA counts of one sample.
#
# Args:
#   experiment: name of an entry in DATA_LOCATIONS ('gnp', 'pnc', or 'tumor').
# Returns:
#   A Seurat object whose 'RNA' assay is created from the 'Gene Expression'
#   matrix of the sample's .h5 file, with `project` set to `experiment`.
# Raises:
#   An error if `experiment` is not a single known sample name.
read_data_into_seurat <- function(experiment){
    # Fail early with a readable message; an unknown name would otherwise
    # hand NULL to Read10X_h5().
    if (!is.character(experiment) || length(experiment) != 1 ||
        !(experiment %in% names(DATA_LOCATIONS))) {
        stop("experiment must be one of: ",
             paste(names(DATA_LOCATIONS), collapse = ", "),
             call. = FALSE)
    }

    # Announce the load before it starts (reading the .h5 is the slow step)
    print('loading RNA data...')
    data <- Seurat::Read10X_h5(DATA_LOCATIONS[[experiment]][1])
    obj <- CreateSeuratObject(
        counts=data$'Gene Expression',
        project=experiment,
        assay='RNA',
        min.cells = 10,    # keep features detected in at least 10 cells
        min.features=100   # keep cells with at least 100 detected features
    )
    return(obj)
}

# Extract the annotation rows that belong to one sample.
#
# Args:
#   df:     annotation table with a `cell` column.
#   prefix: regular expression selecting the sample's rows by `cell`.
# Returns:
#   A data frame of the matching rows; row names are the `cell` values with
#   everything up to and including the last '#' removed.
read_annot_helper <- function(df,prefix){
    sample_rows <- filter(df, str_detect(cell, prefix))
    sample_rows <- mutate(sample_rows, cell = sub(".*#", "", cell))
    column_to_rownames(sample_rows, var = "cell")
}

# Read the published cell-type annotation table and split it by sample.
#
# Args:
#   path: CSV file with the annotations. Defaults to the table inside
#         DATA_DIR, so relocating the data only requires changing DATA_DIR.
# Returns:
#   A list with elements `gnp`, `pnc` and `tumor`, each the data frame
#   produced by read_annot_helper() for that sample's prefix.
read_cell_annotations <- function(path=file.path(DATA_DIR,'20241018_scRNA_CellTypeAnno_Table.csv')){
    df <- readr::read_csv(path, show_col_types = FALSE)
    list(
        gnp = read_annot_helper(df, "^032103"),
        pnc = read_annot_helper(df, "^RS03056"),
        tumor = read_annot_helper(df, "^RS03060")
    )
}

In [None]:
# Load one Seurat object per sample
gnp_data <- read_data_into_seurat("gnp")
pnc_data <- read_data_into_seurat("pnc")
tumor_data <- read_data_into_seurat("tumor")

# Collect the objects in a named list and display it
sc_list <- list(gnp = gnp_data, pnc = pnc_data, tumor = tumor_data)
sc_list

# Attach the cell annotations of each sample as the `annotation` metadata
annot <- read_cell_annotations()
for (sample in names(sc_list)) {
    sc_list[[sample]]$annotation <- annot[[sample]]
}

In [None]:
## QC
# Filter low-quality cells from every sample and plot the QC distributions.
#
# For each Seurat object in `sc_list` this adds the metadata columns
# `percent.mt` (percentage of counts from features matching "^mt-"),
# `log_count_rna` and `log_features_rna`, then keeps the cells whose
# nCount_RNA and nFeature_RNA lie within mean +/- 2 SD on the log scale and
# whose percent.mt is below mean + 2 SD. Violin plots with the thresholds drawn
# in red are printed as one combined figure, and the number of cells dropped
# per sample is reported with message().
#
# Args:
#   sc_list: named list of Seurat objects.
# Returns:
#   The same list with every object subset to the cells that pass QC.
qc <- function(sc_list){
    # TODO: make plots pretty colors
    plots <- list()
    for (name in names(sc_list)){
        x <- sc_list[[name]]
        n_before <- ncol(x)

        # define useful qc metrics
        x[["percent.mt"]] <- PercentageFeatureSet(x, pattern = "^mt-")
        x[["log_count_rna"]] <- log(x$nCount_RNA)
        x[["log_features_rna"]] <- log(x$nFeature_RNA)

        # set thresholds (computed on the log scale, converted back with exp)
        ncount_lower_threshold <- exp(mean(x$log_count_rna) - 2*sd(x$log_count_rna))
        ncount_upper_threshold <- exp(mean(x$log_count_rna) + 2*sd(x$log_count_rna))
        nfeature_lower_threshold <- exp(mean(x$log_features_rna) - 2*sd(x$log_features_rna))
        nfeature_upper_threshold <- exp(mean(x$log_features_rna) + 2*sd(x$log_features_rna))
        mitochondrial_threshold <- mean(x$percent.mt) + 2*sd(x$percent.mt)

        # plot thresholds
        plots[[paste(name,"counts",sep="_")]] <- VlnPlot(x, features="nCount_RNA", log=TRUE, layer="counts") +
            geom_hline(yintercept=ncount_lower_threshold, color='red') +
            geom_hline(yintercept=ncount_upper_threshold, color='red')
        plots[[paste(name,"features",sep="_")]] <- VlnPlot(x, features="nFeature_RNA", log=TRUE, layer="counts") +
            geom_hline(yintercept=nfeature_lower_threshold,color='red') +
            geom_hline(yintercept=nfeature_upper_threshold,color='red')
        plots[[paste(name,"mitochondria",sep="_")]] <- VlnPlot(x, features="percent.mt", log=TRUE, layer="counts") +
            geom_hline(yintercept=mitochondrial_threshold,color='red')

        # apply thresholds
        kept <- x %>%
            subset(nFeature_RNA > nfeature_lower_threshold & nFeature_RNA < nfeature_upper_threshold &
                   nCount_RNA > ncount_lower_threshold & nCount_RNA < ncount_upper_threshold &
                   percent.mt < mitochondrial_threshold
                  )

        # report number of low-quality cells dropped
        n_after <- ncol(kept)
        message(sprintf("%s: dropped %d of %d cells during QC (%d kept)",
                        name, n_before - n_after, n_before, n_after))

        sc_list[[name]] <- kept
    }
    options(repr.plot.width = 12, repr.plot.height = 12)
    combined_plot <- wrap_plots(plots) + plot_layout(ncol = 3)  # Adjust ncol as needed
    print(combined_plot)
    return(sc_list)
}
sc_list <- qc(sc_list)

In [None]:
# Default Seurat workflow for one sample: SCTransform (v2 flavour), PCA,
# then UMAP and neighbour graph on dimensions 1-30, then clustering.
# Takes a Seurat object and returns the processed Seurat object; every step
# is run with verbose = FALSE.
standard_seurat_analysis <- function(seuratobj){
    normalized <- SCTransform(seuratobj, verbose = FALSE, vst.flavor="v2")
    reduced <- RunPCA(normalized, verbose = FALSE)
    embedded <- RunUMAP(reduced, dims = 1:30, verbose = FALSE)
    with_graph <- FindNeighbors(embedded, dims = 1:30, verbose = FALSE)
    FindClusters(with_graph, verbose = FALSE)
}
# Same workflow as the standard analysis, but regressing out `percent.mt`
# during SCTransform. All results are stored under separate names so they sit
# next to the standard ones: assay "m_SCT", reductions "m_pca" / "m_umap",
# graphs "m_nn" / "m_snn", clusters in `mito_regressed_cluster`.
#
# Args:
#   seuratobj: Seurat object with a `percent.mt` metadata column.
# Returns:
#   The Seurat object with the additional results. A `seurat_clusters` column
#   that existed on input is preserved; the active identities are left as set
#   by FindClusters() (the mito-regressed clusters).
regress_mito_seurat_analysis <- function(seuratobj){
    # FindClusters() overwrites `seurat_clusters` on every run. Remember the
    # existing column (if any) so the earlier clustering is not lost.
    has_prior_clusters <- "seurat_clusters" %in% colnames(seuratobj[[]])
    if (has_prior_clusters){
        prior_clusters <- seuratobj[["seurat_clusters"]]
    }

    # FindNeighbors() needs two graph names to store both graphs: with a single
    # name only the kNN graph is kept, so "m_snn" would not be an SNN graph.
    result <- seuratobj %>%
        SCTransform(verbose = FALSE, vst.flavor="v2",vars.to.regress = "percent.mt", new.assay.name = "m_SCT") %>%
        RunPCA(verbose = FALSE, assay="m_SCT", reduction.name="m_pca") %>%
        RunUMAP(dims = 1:30, verbose = FALSE, reduction = "m_pca", reduction.name = "m_umap") %>%
        FindNeighbors(dims = 1:30, verbose = FALSE, reduction = "m_pca", graph.name = c("m_nn", "m_snn")) %>%
        FindClusters(verbose = FALSE, graph.name="m_snn", cluster.name="mito_regressed_cluster")

    if (has_prior_clusters){
        # One-column data frame keyed by cell name; restores `seurat_clusters`
        result <- AddMetaData(result, metadata = prior_clusters)
    }
    return(result)
}

In [None]:
# Run the standard workflow on every sample
sc_list <- lapply(sc_list, standard_seurat_analysis)

In [None]:
# Run the percent.mt-regressed workflow on every sample
sc_list <- lapply(sc_list, regress_mito_seurat_analysis)

In [None]:
# Display the cell-level metadata table of the second object in sc_list
sc_list[[2]][[]]

In [None]:
# Plot the embeddings of every sample, labelled by cell-type annotation.
#
# For each Seurat object two DimPlots are drawn ("umap" and "m_umap"), with
# identities set to the `annotation` metadata column. Optionally an alluvial
# plot relates `seurat_clusters`, `annotation` and `mito_regressed_cluster`.
# All panels are combined into one figure and printed.
#
# Args:
#   sc_list:       named list of Seurat objects holding the reductions and
#                  metadata columns named above.
#   show_alluvial: draw the alluvial panel for each sample? Defaults to TRUE.
#                  Pass FALSE to skip it.
# Returns:
#   NULL, invisibly; the function is called for the printed figure.
dimplots <- function(sc_list, show_alluvial = TRUE){
    plots <- list()
    for (name in names(sc_list)){
        x <- sc_list[[name]]
        Idents(object = x) <- "annotation"
        for (reduction in c("umap","m_umap")){
            # "umap" is the embedding computed without regressing percent.mt
            negation <- if (reduction == "umap") "not" else ""
            title <- paste0("Clustering of sample ",name,'\n',negation," controlling for mitochondrial read fraction")
            plots[[title]] <- DimPlot(x, reduction=reduction, label=TRUE, repel=TRUE) +
                ggtitle(title) +
                theme(legend.position = "bottom")
        }
        if (show_alluvial){
            # Number of cells for every combination of the three labels
            frequency_table <- x[[]] %>%
                group_by(seurat_clusters, annotation, mito_regressed_cluster) %>%
                summarise(count = n(), .groups = 'drop')
            plots[[paste0(name," alluvial")]] <- ggplot(frequency_table,
                    aes(axis1 = seurat_clusters,
                        axis2 = annotation,
                        axis3 = mito_regressed_cluster,
                        y = count)) +
                geom_flow(aes(fill = annotation),decreasing=FALSE) +
                geom_stratum(decreasing=FALSE) +
                geom_text(stat = "stratum", decreasing=FALSE,
                    aes(label = after_stat(stratum)))
        }
    }
    options(repr.plot.width = 20, repr.plot.height = 16)
    # One row per sample: three panels with the alluvial plot, two without
    panels_per_sample <- if (show_alluvial) 3 else 2
    combined_plot <- wrap_plots(plots) + plot_layout(ncol = panels_per_sample)
    print(combined_plot)
    invisible(NULL)
}
dimplots(sc_list)

In [None]:
# Make "m_SCT" the default assay of every sample
sc_list <- lapply(sc_list, function(obj) {
    DefaultAssay(obj) <- "m_SCT"
    obj
})

In [None]:
# Feature plots of selected genes, one figure per sample
options(repr.plot.width = 20, repr.plot.height = 20)
ncol <- 5
features <- c(
    "Gli1", "Gli2", "Mki67", "Ctnnb1", "Birc5", "Ntrk3",
    "Trp53", "Pten", "Tll1", "Tex14",
    "Rbfox3", "Cntn2", "Neurod1", "Cacna1e", "Kcnk1", "Grin2b",
    "Cntnap4", "Samd12", "Zmat4",
    "Prom1", "Myc", "Scrt2"
)
FeaturePlot(sc_list[[1]], features = features, slot = "data", ncol = ncol, label = TRUE)
FeaturePlot(sc_list[[2]], features = features, slot = "data", ncol = ncol, label = TRUE)
FeaturePlot(sc_list[[3]], features = features, slot = "data", ncol = ncol, label = TRUE)

In [None]:
# saveRDS() does not create missing directories, so make sure out/ exists
dir.create("out", showWarnings = FALSE, recursive = TRUE)
saveRDS(sc_list, "out/seuratobject_list.rds")

In [None]:
# Reload the saved list of Seurat objects from disk
sc_list <- readRDS(file = "out/seuratobject_list.rds")

# Dead code

In [None]:
# Unused attempt to add the ATAC modality as a Signac chromatin assay.
# NOTE(review): `data`, `obj` and `experiment` are only assigned inside
# read_data_into_seurat() in this notebook, so this cell will not run at top
# level as written; presumably it was lifted out of that function.
#
# Signac hangs on CreateChromatinAssay during "computing hash" step.
# possible causes:
#  Apple silicon architecture?
#  Just requires a ton of resources?
#  May need to load multiome data on shirokane, or use a different tool.

library(Signac)
library(BSgenome.Mmusculus.UCSC.mm10)
library(EnsDb.Mmusculus.v79)

# Gene annotation from the EnsDb package, switched to UCSC sequence names
# and labelled as mm10
ANNOTATION <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)
seqlevelsStyle(ANNOTATION) <- 'UCSC'
genome(ANNOTATION) <- "mm10"

print('loading ATAC data...')
atac_counts <- data$Peaks
# Parse the peak names ("chr:start-end") and keep only the peaks whose
# sequence is among standardChromosomes()
grange.counts <- StringToGRanges(rownames(atac_counts), sep = c(":", "-"))
grange.use <- seqnames(grange.counts) %in% standardChromosomes(grange.counts)
atac_counts <- atac_counts[as.vector(grange.use), ]
# Second element of a DATA_LOCATIONS entry is the fragments file
obj[["ATAC"]] <- CreateChromatinAssay(
    counts = atac_counts,
    sep = c(":", "-"),
    genome = 'mm10',
    fragments = DATA_LOCATIONS[[experiment]][2],
    min.cells = 10,
    min.features = 100,
    annotation = ANNOTATION
)