-
Notifications
You must be signed in to change notification settings - Fork 0
/
clustering_scRNA.Rmd
168 lines (135 loc) · 7.38 KB
/
clustering_scRNA.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
---
title: "Integration and clustering of scRNA sequencing data"
author: "Anthony Hung"
date: "2021-01-19"
output: workflowr::wflow_html
editor_options:
chunk_output_type: console
---
# Introduction
To determine cell type heterogeneity between samples jointly, this code treats each individual (unstrained condition) as its own separate sample and performs integration across individuals. It then finds clusters within the integrated data and characterizes the clusters.
# Load data and packages
The ANT1.2 seurat object was generated by running the preprocessing code in [Pre-processing of raw 10x files into count matrices and demultiplexing](preProcess_scRNA.html).
```{r load}
library(Seurat)
library(gridExtra)
library(ggplot2)
ANT1.2 <- readRDS("data/ANT1_2.rds")
```
# Split data into individuals
First, we split the merged seurat object into separate objects corresponding to the individual from which cells originated. Then integrate across the individuals using SCT transform
```{r split single cell data}
#split data into each individual/sample
NA18855_Unstrain <- subset(ANT1.2, labels == "NA18855_Unstrain")
NA18856_Unstrain <- subset(ANT1.2, labels == "NA18856_Unstrain")
NA19160_Unstrain <- subset(ANT1.2, labels == "NA19160_Unstrain")
# Create a seurat object merged for all the unstrain samples (because we are missing one individual for the strained samples)
seurat.list <- list(NA18855_Unstrain, NA18856_Unstrain, NA19160_Unstrain)
for (i in 1:length(seurat.list)) {
seurat.list[[i]] <- SCTransform(seurat.list[[i]], verbose = FALSE)
}
SCT.features <- SelectIntegrationFeatures(object.list = seurat.list, nfeatures = 5000)
seurat.list <- PrepSCTIntegration(object.list = seurat.list, anchor.features = SCT.features,
verbose = FALSE)
#find anchors
seurat.anchors <- FindIntegrationAnchors(object.list = seurat.list, normalization.method = "SCT",
anchor.features = SCT.features, verbose = FALSE)
SCT.integrated <- IntegrateData(anchorset = seurat.anchors, normalization.method = "SCT",
verbose = FALSE)
#visualized integrated
SCT.integrated <- RunPCA(SCT.integrated, verbose = FALSE, npcs = 100)
DimPlot(SCT.integrated, reduction = "pca", group.by = c("labels"))
ElbowPlot(SCT.integrated, ndims = 100) #38 PCs?
SCT.integrated <- FindNeighbors(SCT.integrated, dims = 1:38)
SCT.integrated <- FindClusters(SCT.integrated, resolution = 0.4)
DimPlot(SCT.integrated, group.by = c("labels"), reduction = "pca")
DimPlot(SCT.integrated, group.by = c("seurat_clusters"), reduction = "pca")
SCT.integrated <- RunUMAP(SCT.integrated, dims = 1:38)
p1_SCT <- DimPlot(SCT.integrated, group.by = c("orig.ident"))
p2_SCT <- DimPlot(SCT.integrated, group.by = c("labels"))
p3_SCT <- DimPlot(SCT.integrated, group.by = c("seurat_clusters"))
grid.arrange(p1_SCT, p2_SCT, p3_SCT, nrow = 2)
#for figures: remove legend and add in later
p1_figure <- p2_SCT + theme(legend.position = "none")
p2_figure <- p3_SCT + theme(legend.position = "none") + scale_color_manual(values=c("plum3", "#E69F00", "#654321"))
grid.arrange(p1_figure, p2_figure)
p2_SCT
p3_SCT + scale_color_manual(values=c("plum3", "#E69F00", "#654321"))
SCT.integrated@active.ident <- SCT.integrated$integrated_snn_res.0.4
```
# Examine clusters in integrated data
We identified 3 clusters in the integrated data. We now characterize them by gene expression for a chondrogenic marker gene to get a sense of which cluster(s) represent more mature chondrocytes.
```{r examine clusters}
cluster_memberships <- as.matrix(table(SCT.integrated@meta.data$integrated_snn_res.0.4, SCT.integrated@meta.data$labels))
cluster_memberships
cluster_memberships_proportions <- prop.table(cluster_memberships, margin = 2)
cluster_memberships_proportions
#make a nice proportions bar plot for a figure
library(cowplot)
long_cluster_memberships_proportions <- as_tibble(as.data.frame(cluster_memberships_proportions))
ggplot(long_cluster_memberships_proportions, aes(x = Var2, y = Freq, fill = Var1)) +
geom_bar(position = "dodge", stat = "identity") +
scale_fill_manual(values=c("plum3", "#E69F00", "#654321")) +
theme_cowplot() +
labs(title = "Proportion of cells from each individual \n assigned to each unsupervised cluster", x = "", y = "Proportion", fill = "Cluster")
```
# Explore some cluster marker genes
Here, we fit a poisson ash model to raw counts from each of the three clusters for a few MSC and chondrocyte marker genes and plot their distributions broken down by cluster and by individual. It looks like cluster 2 has higher expression of chondrocyte markers, while more or less all three clusters have similar expression of MSC markers.
```{r poisson ash}
#run poisson ash to data per cluster and per indivdiual, then show the estimated prior distribution
library(ashr)
library(Matrix)
#plot curves for genes separated by indivdual (information for indivdual contained in labels)
plot_by_labels <- function(gene, counts, labels){
#create dataframe to store the cdf axes and the groups
dataframe_cdf <- data.frame(x = c(), y = c(), label = c())
for(individual in unique(labels)){ #in the case that the labels supplied contain cluster, it will do this by cluster instead
x <- counts[gene, labels == individual]
s <- colSums(counts[,labels == individual]) #sum of molecule counts per sample
lam <- x/s
fit <- ashr::ash_pois(x, s, mixcompdist = "halfuniform")
cdf <- ashr::cdf.ash(fit, seq(from = 0, to = max(lam), length.out = 1000))
temp_df <- cbind(cdf$x, t(cdf$y), individual)
names(temp_df) <- c("x", "y", "label")
dataframe_cdf <- rbind(dataframe_cdf, temp_df)
}
names(dataframe_cdf) <- c("x", "y", "label")
dataframe_cdf$x <- as.character(dataframe_cdf$x)
dataframe_cdf$y <- as.character(dataframe_cdf$y)
dataframe_cdf$x <- as.numeric(dataframe_cdf$x)
dataframe_cdf$y <- as.numeric(dataframe_cdf$y)
#use df to make our plot
plot <- ggplot(dataframe_cdf, aes(x = x, y = y, color = as.factor(label), group=as.factor(label))) +
geom_point() +
labs(title = paste0(gene), x = "Latent gene expression", y = "CDF", color = "Label") +
theme_cowplot()
return(plot)
}
features.plot.chond <- c("TIMP3", "TIMP2", "COL11A1",
"SOX5", "SOX6",
"TGFBI")
features.plot.msc <- c("THY1", "NT5E")
#by cluster
for (marker in c(features.plot.msc, features.plot.chond)){
print(plot_by_labels(marker, SCT.integrated@assays$RNA@counts, SCT.integrated$integrated_snn_res.0.4) + labs(color = "Cluster"))
}
#by individual
for (marker in c(features.plot.msc, features.plot.chond)){
print(plot_by_labels(marker, SCT.integrated@assays$RNA@counts, SCT.integrated$labels) + labs(color = "Individual"))
}
#plots for figure
a <- plot_by_labels("COL11A1", SCT.integrated@assays$RNA@counts, SCT.integrated$integrated_snn_res.0.4) +
theme(legend.position = "none") +
scale_color_manual(values=c("plum3", "#E69F00", "#654321"))
b <- plot_by_labels("COL11A1", SCT.integrated@assays$RNA@counts, SCT.integrated$labels) +
theme(legend.position = "none")
grid.arrange(a, b, nrow = 1)
a<- c("MMP2", "MMP14")
for (marker in a){
print(plot_by_labels(marker, SCT.integrated@assays$RNA@counts, SCT.integrated$integrated_snn_res.0.4) + labs(color = "Cluster"))
}
#by individual
for (marker in a){
print(plot_by_labels(marker, SCT.integrated@assays$RNA@counts, SCT.integrated$labels) + labs(color = "Individual"))
}
```