# RPCA
Preprocessing done in notebook `preprocessing.ipynb`. 
tutorial at https://satijalab.org/seurat/articles/integration_rpca.html  

## Results
Output figure in `figures/rpca_integration.png`.  
I think a few things are apparent from this plot:
- **(a-b)** Integration at the cell level seems to have worked well. In subplots a-b, we see that cells cluster by cell type rather than sample origin.
- **(d-g)** 2 clusters of proliferating cells express *Mki67*. *Neurod1* expression is ubiquitous, but somewhat stronger in the differentiated cells, such that we see weak anticorrelation with *Mki67*.
- There seems to be a bug in `VlnPlot` when `add.noise = TRUE`: the violin area is dropped from the plot when the data contain many zeroes.
- **(h-k)** Comparison of *Neurod1* expression levels seems to be heavily influenced by batch. If we were to do a statistical test of *Neurod1* expression in these data, we would probably see that tumor > pnc > gnp. However, there are several problems with such an approach:
  - Batch correction adds a small error *e* to all tumor *Neurod1* expression levels, and subtracts a small error *f* from gnp. Thus, all cells with zero *Neurod1* counts in tumor will have greater 'expression' than cells with zero counts in pnc. Thus, I think the result may be more reflective of sequencing depth or sample quality than biological *Neurod1* variation.
  - The Mann-Whitney (M-W) test is unreliable when there are many ties, which is the case for pnc. We would need to use the Kolmogorov-Smirnov (K-S) test or something similar.
  - RPCA seems to have mapped all cells into pnc space. This means that transformation was only performed on gnp and tumor cells. Ironically, this may introduce a batch effect like point 1 above where an error-prone transformation was performed in one sample but not the other.

## Conclusion
RPCA is not appropriate to compare expression of an individual gene across samples.

In [None]:
# Show R messages in English (set LANGUAGE to "ja" if you prefer Japanese).
Sys.setenv(LANGUAGE = "en")

# Packages attached for the rest of the notebook.
library(Seurat)
library(tidyverse)
library(patchwork)
# library(future)

# Fix the random seed for the session.
set.seed(47)

In [None]:
# Set parallel execution settings
#future::plan("multisession", workers = as.integer(availableCores()/2), gc=TRUE)
#options(future.globals.maxSize = 1024*32*1024^2) # Set max variable size to 32 GiB (32 * 1024^3 bytes)
#options(future.globals.onReference = "error")

In [None]:
# Load the list of Seurat objects written to out/seuratobject_list.rds.
sc_list <- readRDS(file = "out/seuratobject_list.rds")

In [None]:
# RPCA: find integration anchors between the objects in `sc_list`.
# Notes:
# - Seurat v5 has a one-step IntegrateLayers function, but it can't output
#   transformed features, only an embedding. Therefore we use the old workflow
#   from v4. See https://satijalab.org/seurat/articles/seurat5_integration and
#   https://github.com/satijalab/seurat/issues/8653
# - futures multisession processing fails on IntegrateData.

# Name of the SCT assay shared by every object in `sc_list`.
sct_assay <- "m_SCT"

# Select features that are repeatedly variable across datasets for integration.
# One assay name is supplied per object, so this works for any number of
# samples rather than exactly three.
features <- SelectIntegrationFeatures(
    object.list = sc_list,
    assay = rep(sct_assay, length(sc_list))
)

# Scale and run PCA on each dataset using these features.
# NOTE(review): the Seurat SCT + RPCA vignette calls PrepSCTIntegration before
# the per-object RunPCA and does not call ScaleData here -- confirm that this
# order is intended.
sc_list <- lapply(X = sc_list, FUN = function(x) {
    x <- ScaleData(x, features = features, verbose = FALSE)
    RunPCA(x, features = features, verbose = FALSE)
})

sc_list <- PrepSCTIntegration(
    sc_list,
    assay = sct_assay,
    anchor.features = features
)
anchors <- FindIntegrationAnchors(
    object.list = sc_list,
    anchor.features = features,
    scale = FALSE,
    reduction = "rpca",
    normalization.method = "SCT"
)

In [None]:
# Raise the vector-heap limit to 32 Gb.
# R_MAX_VSIZE is only read when an R process starts, so setting it here does
# not change the limit of the running session; it is kept (in the documented
# "<n>Gb" format) so that R processes started from this session inherit it.
Sys.setenv(R_MAX_VSIZE = "32Gb")

# mem.maxVSize() queries/sets the limit of the current process, in Mb.
# Only ever raise the limit: on platforms where it is already larger
# (or unlimited) it is left untouched.
max_vsize_mb <- 32 * 1024
if (mem.maxVSize() < max_vsize_mb) {
    invisible(mem.maxVSize(vsize = max_vsize_mb))
}

In [None]:
# Trying to integrate all genes runs into OOM errors, so only the anchor
# features are integrated.
# shared_features <- Reduce(intersect, lapply(sc_list, Features))

# The gene of interest needs to be in `features`; stop before the expensive
# IntegrateData call instead of only printing the result of the check.
gene_of_interest <- "Neurod1"
if (!(gene_of_interest %in% features)) {
    stop(
        "'", gene_of_interest, "' is not among the integration features",
        call. = FALSE
    )
}

combined_data <- IntegrateData(
    anchorset = anchors,
    features.to.integrate = features,
    normalization.method = "SCT"
)

In [None]:
# Show the first rows of what `combined_data[[]]` returns
# (presumably the per-cell metadata table).
head(combined_data[[]])

In [None]:
# Run downstream analysis on the corrected data in the "integrated" assay.
# NOTE(review): the previous comment said the unmodified data still resides in
# the 'RNA' assay; the integration in this notebook was run on 'm_SCT', so
# confirm which assay holds the uncorrected values.
DefaultAssay(combined_data) <- "integrated"

# Number of principal components computed and used by every step below.
n_pcs <- 30
pca_dims <- seq_len(n_pcs)

# Standard workflow for visualization and clustering.
# NOTE(review): Seurat's SCT integration vignette skips ScaleData after
# IntegrateData -- confirm that re-scaling here is intended.
combined_data <- combined_data %>%
    ScaleData(verbose = FALSE) %>%
    RunPCA(npcs = n_pcs, verbose = FALSE) %>%
    RunUMAP(reduction = "pca", dims = pca_dims) %>%
    FindNeighbors(reduction = "pca", dims = pca_dims) %>%
    FindClusters(resolution = 0.5)

In [None]:
# Visualization
options(repr.plot.width = 20, repr.plot.height = 12)

# UMAPs of all cells, grouped by sample, by annotation and by cluster.
p1 <- DimPlot(
    combined_data,
    reduction = "umap", group.by = "orig.ident", shuffle = TRUE
)
p2 <- DimPlot(
    combined_data,
    reduction = "umap", group.by = "annotation",
    label = TRUE, shuffle = TRUE, repel = TRUE
)
p3 <- DimPlot(
    combined_data,
    reduction = "umap", group.by = "seurat_clusters",
    label = TRUE, shuffle = TRUE, repel = TRUE
)

# Keep proliferative and differentiated cells; NA is part of the lookup set,
# so cells whose annotation is NA are kept as well.
seurat_subset <- subset(
    combined_data,
    subset = annotation %in% c("ProliferativeCells", "DifferentiatedCells", NA)
)

# Blended Neurod1/Mki67 feature plot and Neurod1 violin on the subset.
p4 <- FeaturePlot(
    seurat_subset,
    features = c("Neurod1", "Mki67"), order = TRUE, blend = TRUE
)
p5 <- VlnPlot(
    seurat_subset,
    features = c("Neurod1"), group.by = "orig.ident",
    layer = "scale.data", alpha = 0.2
)

In [None]:
# Neurod1 violin (scale.data layer, no added noise, no legend) for the
# proliferative and differentiated cells of a single, un-integrated sample.
neurod1_violin_no_noise <- function(sample_obj) {
    kept_cells <- subset(
        sample_obj,
        subset = annotation %in% c("ProliferativeCells", "DifferentiatedCells")
    )
    violin <- VlnPlot(
        kept_cells,
        features = c("Neurod1"), group.by = "orig.ident",
        layer = "scale.data", add.noise = FALSE
    )
    violin + theme(legend.position = "none")
}

# One panel per object in `sc_list`.
p6 <- neurod1_violin_no_noise(sc_list[[1]])
p7 <- neurod1_violin_no_noise(sc_list[[2]])
p8 <- neurod1_violin_no_noise(sc_list[[3]])

In [None]:
options(repr.plot.width = 20, repr.plot.height = 18)

# Alternative layout (disabled):
#(p1 | p2 | p3 )
#(p4 | p5) +
#  plot_layout(widths = c(1, 2))

# Assemble the three-row figure: UMAPs on top, the blended feature plot in the
# middle, violin plots at the bottom. Panels are tagged with lowercase letters.
umap_row <- p1 | p2 | p3
feature_row <- p4 + plot_layout(widths = c(3, 3, 3, 1))
violin_row <- p5 | p6 | p7 | p8
p <- (umap_row / feature_row / violin_row) + plot_annotation(tag_levels = 'a')
p

# ggsave() does not create a missing output directory by default, so make sure
# it exists (a no-op when it is already there).
dir.create('figures', showWarnings = FALSE, recursive = TRUE)
ggsave('figures/rpca_integration.png', p, scale = 2)

In [None]:
# Possible bug in VlnPlot when add.noise is TRUE (the default used here): it
# seems to delete the plot area when the data contain lots of zeroes.
options(repr.plot.width = 20, repr.plot.height = 6)

# Same per-sample Neurod1 violin as p7/p8, but with VlnPlot's default
# add.noise setting and the legend left in place.
neurod1_violin_default <- function(sample_obj) {
    kept_cells <- subset(
        sample_obj,
        subset = annotation %in% c("ProliferativeCells", "DifferentiatedCells")
    )
    VlnPlot(
        kept_cells,
        features = c("Neurod1"), group.by = "orig.ident", layer = "scale.data"
    )
}

p8.2 <- neurod1_violin_default(sc_list[[3]])
p7.2 <- neurod1_violin_default(sc_list[[2]])

# Side-by-side comparison: without noise vs. default, for samples 2 and 3.
p7 | p7.2 | p8 | p8.2