analysis/clustering_scRNA.Rmd

---
title: "Integration and clustering of scRNA sequencing data"
author: "Anthony Hung"
date: "2021-01-19"
output: workflowr::wflow_html
editor_options:
  chunk_output_type: console
---

# Introduction

To determine cell type heterogeneity between samples jointly, this code treats each individual (unstrained condition) as its own separate sample and performs integration across individuals. It then finds clusters within the integrated data and characterizes the clusters.

# Load data and packages

The ANT1.2 seurat object was generated by running the preprocessing code in [Pre-processing of raw 10x files into count matrices and demultiplexing](preProcess_scRNA.html).

```{r load}
library(Seurat)
library(gridExtra)
library(ggplot2)

ANT1.2 <- readRDS("data/ANT1_2.rds")
```

# Split data into individuals

First, we split the merged seurat object into separate objects corresponding to the individual from which cells originated. Then integrate across the individuals using SCT transform

```{r split single cell data}
#split data into each individual/sample
NA18855_Unstrain <- subset(ANT1.2, labels == "NA18855_Unstrain")
NA18856_Unstrain <- subset(ANT1.2, labels == "NA18856_Unstrain")
NA19160_Unstrain <- subset(ANT1.2, labels == "NA19160_Unstrain")

# Create a seurat object merged for all the unstrain samples (because we are missing one individual for the strained samples)
seurat.list <- list(NA18855_Unstrain, NA18856_Unstrain, NA19160_Unstrain)
for (i in 1:length(seurat.list)) {
    seurat.list[[i]] <- SCTransform(seurat.list[[i]], verbose = FALSE)
}

SCT.features <- SelectIntegrationFeatures(object.list = seurat.list, nfeatures = 5000)
seurat.list <- PrepSCTIntegration(object.list = seurat.list, anchor.features = SCT.features, 
    verbose = FALSE)

#find anchors
seurat.anchors <- FindIntegrationAnchors(object.list = seurat.list, normalization.method = "SCT", 
    anchor.features = SCT.features, verbose = FALSE)
SCT.integrated <- IntegrateData(anchorset = seurat.anchors, normalization.method = "SCT",
    verbose = FALSE)

#visualized integrated
SCT.integrated <- RunPCA(SCT.integrated, verbose = FALSE, npcs = 100)
DimPlot(SCT.integrated, reduction = "pca", group.by = c("labels"))
ElbowPlot(SCT.integrated, ndims = 100) #38 PCs?
SCT.integrated <- FindNeighbors(SCT.integrated, dims = 1:38)
SCT.integrated <- FindClusters(SCT.integrated, resolution = 0.4)
DimPlot(SCT.integrated, group.by = c("labels"), reduction = "pca")
DimPlot(SCT.integrated, group.by = c("seurat_clusters"), reduction = "pca")

SCT.integrated <- RunUMAP(SCT.integrated, dims = 1:38)

p1_SCT <- DimPlot(SCT.integrated, group.by = c("orig.ident"))
p2_SCT <- DimPlot(SCT.integrated, group.by = c("labels"))
p3_SCT <- DimPlot(SCT.integrated, group.by = c("seurat_clusters"))
grid.arrange(p1_SCT, p2_SCT, p3_SCT, nrow = 2)

#for figures: remove legend and add in later
p1_figure <- p2_SCT + theme(legend.position = "none")
p2_figure <- p3_SCT + theme(legend.position = "none") + scale_color_manual(values=c("plum3", "#E69F00", "#654321"))
grid.arrange(p1_figure, p2_figure)

p2_SCT 
p3_SCT + scale_color_manual(values=c("plum3", "#E69F00", "#654321"))

SCT.integrated@active.ident <- SCT.integrated$integrated_snn_res.0.4
```

# Examine clusters in integrated data

We identified 3 clusters in the integrated data. We now characterize them by gene expression for a chondrogenic marker gene to get a sense of which cluster(s) represent more mature chondrocytes.

```{r examine clusters}
cluster_memberships <- as.matrix(table(SCT.integrated@meta.data$integrated_snn_res.0.4, SCT.integrated@meta.data$labels))
cluster_memberships

cluster_memberships_proportions <- prop.table(cluster_memberships, margin = 2)
cluster_memberships_proportions

#make a nice proportions bar plot for a figure
library(cowplot)
long_cluster_memberships_proportions <- as_tibble(as.data.frame(cluster_memberships_proportions))
ggplot(long_cluster_memberships_proportions, aes(x = Var2, y = Freq, fill = Var1)) +
     geom_bar(position = "dodge", stat = "identity") + 
     scale_fill_manual(values=c("plum3", "#E69F00", "#654321")) + 
     theme_cowplot() +
     labs(title = "Proportion of cells from each individual \n assigned to each unsupervised cluster", x = "", y = "Proportion", fill = "Cluster")
```

# Explore some cluster marker genes

Here, we fit a poisson ash model to raw counts from each of the three clusters for a few MSC and chondrocyte marker genes and plot their distributions broken down by cluster and by individual. It looks like cluster 2 has higher expression of chondrocyte markers, while more or less all three clusters have similar expression of MSC markers.

```{r poisson ash}
#run poisson ash to data per cluster and per indivdiual, then show the estimated prior distribution
library(ashr)
library(Matrix)
#plot curves for genes separated by indivdual (information for indivdual contained in labels)
plot_by_labels <- function(gene, counts, labels){
     #create dataframe to store the cdf axes and the groups
     dataframe_cdf <- data.frame(x = c(), y = c(), label = c())
     for(individual in unique(labels)){ #in the case that the labels supplied contain cluster, it will do this by cluster instead
          x <- counts[gene, labels == individual]
          s <- colSums(counts[,labels == individual]) #sum of molecule counts per sample
          lam <- x/s
          fit <- ashr::ash_pois(x, s, mixcompdist = "halfuniform")
          cdf <- ashr::cdf.ash(fit, seq(from = 0, to = max(lam), length.out = 1000))
          temp_df <- cbind(cdf$x, t(cdf$y), individual)
          names(temp_df) <- c("x", "y", "label")
          dataframe_cdf <- rbind(dataframe_cdf, temp_df)
     }
     names(dataframe_cdf) <- c("x", "y", "label")
     dataframe_cdf$x <- as.character(dataframe_cdf$x)
     dataframe_cdf$y <- as.character(dataframe_cdf$y)
     dataframe_cdf$x <- as.numeric(dataframe_cdf$x)
     dataframe_cdf$y <- as.numeric(dataframe_cdf$y)
     #use df to make our plot
     plot <- ggplot(dataframe_cdf, aes(x = x, y = y, color = as.factor(label), group=as.factor(label))) +
          geom_point() + 
          labs(title = paste0(gene), x = "Latent gene expression", y = "CDF", color = "Label") +
          theme_cowplot()
     return(plot)
}

features.plot.chond <- c("TIMP3", "TIMP2", "COL11A1", 
                   "SOX5", "SOX6",
                   "TGFBI")
features.plot.msc <- c("THY1",  "NT5E")

#by cluster
for (marker in c(features.plot.msc, features.plot.chond)){
     print(plot_by_labels(marker, SCT.integrated@assays$RNA@counts, SCT.integrated$integrated_snn_res.0.4) + labs(color = "Cluster"))
}

#by individual
for (marker in c(features.plot.msc, features.plot.chond)){
     print(plot_by_labels(marker, SCT.integrated@assays$RNA@counts, SCT.integrated$labels) + labs(color = "Individual"))
}

#plots for figure
a <- plot_by_labels("COL11A1", SCT.integrated@assays$RNA@counts, SCT.integrated$integrated_snn_res.0.4) + 
     theme(legend.position = "none") + 
     scale_color_manual(values=c("plum3", "#E69F00", "#654321"))
b <- plot_by_labels("COL11A1", SCT.integrated@assays$RNA@counts, SCT.integrated$labels) + 
     theme(legend.position = "none") 
grid.arrange(a, b, nrow = 1)

a<- c("MMP2", "MMP14")

for (marker in a){
     print(plot_by_labels(marker, SCT.integrated@assays$RNA@counts, SCT.integrated$integrated_snn_res.0.4) + labs(color = "Cluster"))
}

#by individual
for (marker in a){
     print(plot_by_labels(marker, SCT.integrated@assays$RNA@counts, SCT.integrated$labels) + labs(color = "Individual"))
}
```