## B01: 必要なパッケージを読み込む

In [None]:
library(Seurat)
library(tidyverse)

## B02: シードの固定

In [None]:
# Fix the random-number seed so that later stochastic steps are reproducible
set.seed(seed = 1234)

## B03: データ（遺伝子発現量行列）を読み込む

In [None]:
# Read the tab-separated expression table; the first line of the file is
# used as the column names. (This takes a little while.)
df_counts <- read_tsv(
  file = "data/GSM3173562_Lakshmipuram_NCBI_processeddata.txt",
  col_names = TRUE
)

## B04: 読み込んだデータの形を確認する

In [None]:
# Display the loaded table to check its contents
df_counts

In [None]:
# Number of rows in the loaded table
nrow(df_counts)

# Number of columns in the loaded table
ncol(df_counts)

## B05: データを Seurat の形に変換する

In [None]:
# Convert the data frame to a matrix, leaving out the GENE column
mat_counts <- as.matrix(select(df_counts, -GENE))

# Use the GENE column of df_counts as the row names of mat_counts
rownames(mat_counts) <- df_counts[["GENE"]]

In [None]:
# Check the structure of the matrix that was just built
str(object = mat_counts)

In [None]:
# Build a Seurat object from the count matrix
planarian <- CreateSeuratObject(
  counts = mat_counts,
  project = "planarian_2k"
)

In [None]:
# Display the Seurat object (planarian) that was just created
planarian

## B06: 品質の低い細胞をフィルターする

In [None]:
# Violin plots of nFeature_RNA and nCount_RNA, laid out in two columns
VlnPlot(
  planarian,
  features = c("nFeature_RNA", "nCount_RNA"),
  ncol = 2
)

In [None]:
# Scatter plot of nCount_RNA against nFeature_RNA before filtering
FeatureScatter(
  planarian,
  feature1 = "nCount_RNA",
  feature2 = "nFeature_RNA"
)

In [None]:
# Keep only the cells with nFeature_RNA >= 200 and nCount_RNA >= 500
planarian <- subset(
  planarian,
  subset = nFeature_RNA >= 200 & nCount_RNA >= 500
)

In [None]:
# Display the Seurat object again after the filtering step
planarian

In [None]:
# Same scatter plot as before, now drawn on the filtered object
FeatureScatter(
  planarian,
  feature1 = "nCount_RNA",
  feature2 = "nFeature_RNA"
)

## B07: 発現量データを正規化する

In [None]:
# Normalize the expression data with the "LogNormalize" method and a
# scale factor of 10000
planarian <- NormalizeData(
  planarian,
  normalization.method = "LogNormalize",
  scale.factor = 10000
)

## B08: 高変動遺伝子（highly variable genes）を抽出する

In [None]:
# Select 2000 variable features using the "vst" selection method
planarian <- FindVariableFeatures(
  planarian,
  selection.method = "vst",
  nfeatures = 2000
)

In [None]:
# First 10 entries of the variable-feature list
top10 <- head(VariableFeatures(planarian), n = 10)

In [None]:
# Draw the variable-feature plot as is ...
plot1 <- VariableFeaturePlot(planarian)
plot1

# ... and once more with the genes in top10 labelled
LabelPoints(
  plot = plot1,
  points = top10,
  repel = TRUE
)

## B09: データをスケーリングする



In [None]:
# Scale the data for every gene in the object (this takes a little while)
all.genes <- rownames(planarian)
planarian <- ScaleData(
  planarian,
  features = all.genes
)

## B10: PCA（主成分分析）を用いて次元削減を行う

In [None]:
# Run PCA using the variable features as input
planarian <- RunPCA(
  planarian,
  features = VariableFeatures(object = planarian)
)

In [None]:
# Print 5 features for each of PCA dimensions 1 to 5
print(
  planarian[["pca"]],
  dims = 1:5,
  nfeatures = 5
)

In [None]:
# Visualize the loadings of PCA dimensions 1 and 2
VizDimLoadings(
  planarian,
  dims = 1:2,
  reduction = "pca"
)

In [None]:
# Plot the cells in the PCA reduction
DimPlot(
  planarian,
  reduction = "pca"
)

In [None]:
# Heatmaps for dimensions 1 to 9, drawn with cells = 500 and balanced = TRUE
DimHeatmap(
  planarian,
  dims = 1:9,
  cells = 500,
  balanced = TRUE
)


## B11: データの有効な次元数を調べる


In [None]:
# JackStraw with 100 replicates, then score dimensions 1 to 20
# (this takes a little while)
planarian <- JackStraw(
  planarian,
  num.replicate = 100
)
planarian <- ScoreJackStraw(
  planarian,
  dims = 1:20
)

In [None]:
# Plot the JackStraw result for dimensions 1 to 15
JackStrawPlot(
  planarian,
  dims = 1:15
)

In [None]:
# Elbow plot, a second view for choosing how many dimensions to keep
ElbowPlot(object = planarian)

## B12: 細胞をクラスタリングする

In [None]:
# Build the neighbor graph on dimensions 1 to 10, then cluster the cells
# at resolution 0.5
planarian <- FindNeighbors(
  planarian,
  dims = 1:10
)
planarian <- FindClusters(
  planarian,
  resolution = 0.5
)

In [None]:
# First 10 identities of the object
head(Idents(planarian), n = 10)

# First 10 values of the seurat_clusters column
head(planarian$seurat_clusters, n = 10)

In [None]:
# Turn the identities into a tibble whose names go into the "cell_barcode"
# column, and show its first rows
head(as_tibble(Idents(planarian), rownames = "cell_barcode"))

In [None]:
# Count the number of cells per identity value
Idents(planarian) %>%
  as_tibble(rownames = "cell_barcode") %>%
  group_by(value) %>%
  summarise(n_cell = n())

## B13: PCAの結果にさらにUMAPをかけて２次元空間に射影する

In [None]:
# Run UMAP on dimensions 1 to 10
planarian <- RunUMAP(
  planarian,
  dims = 1:10
)


In [None]:
# Plot the cells in the UMAP reduction
DimPlot(
  planarian,
  reduction = "umap"
)


## B14: 各クラスターに特徴的な遺伝子群を探す

In [None]:
# Find all markers of cluster 1.
# NOTE(review): this call previously passed ident.1 = 2, which contradicted
# both this comment and the variable name; it now queries cluster 1.
# If cluster 2 was intended, rename the variable instead.
cluster1_markers <- FindMarkers(
  planarian,
  ident.1 = 1,
  min.pct = 0.25
)
head(cluster1_markers, n = 5)

In [None]:
# Markers that distinguish cluster 5 from clusters 0 and 3
cluster5_markers <- FindMarkers(
  planarian,
  ident.1 = 5,
  ident.2 = c(0, 3),
  min.pct = 0.25
)
head(cluster5_markers, n = 5)

In [None]:
# Markers of every cluster versus all remaining cells; only positive markers
# are reported
planarian_markers <- FindAllMarkers(
  planarian,
  only.pos = TRUE,
  min.pct = 0.25,
  logfc.threshold = 0.25
)

In [None]:
# Show the 2 markers with the largest avg_log2FC in each cluster.
# slice_max() replaces the superseded top_n(); like top_n() it keeps ties,
# and it returns the rows sorted by descending avg_log2FC within each cluster.
planarian_markers %>%
  group_by(cluster) %>%
  slice_max(order_by = avg_log2FC, n = 2) %>%
  ungroup()

## B15: クラスターごとに遺伝子発現量をプロットする

In [None]:
# Violin plots of two selected genes across the clusters
VlnPlot(
  planarian,
  features = c(
    "Smed-prog-2a-SmedASXL-014068-BPKG56961",
    "SmedASXL-008653"
  )
)


In [None]:
# Collect the 5 markers with the largest avg_log2FC in each cluster.
# The table has its own name: it holds 5 genes per cluster (not 10), and
# reusing `top10` would overwrite the variable-feature vector of that name
# created earlier in the notebook.
# slice_max() replaces the superseded top_n(); ties are kept as before, and
# rows come back sorted by descending avg_log2FC within each cluster, which
# is also the row order of the heatmap.
top5_markers <- planarian_markers %>%
  group_by(cluster) %>%
  slice_max(order_by = avg_log2FC, n = 5) %>%
  ungroup()

# Enlarge the notebook plot area for the heatmap
options(repr.plot.width = 15, repr.plot.height = 10)

DoHeatmap(planarian, features = top5_markers$gene) + NoLegend()

## B16: 遺伝子発現量をUMAPの図に重ねる

In [None]:
# Pick the marker with the largest avg_log2FC in each cluster and keep only
# the gene names. slice_max() replaces the superseded top_n() and keeps ties
# in the same way, so a cluster with tied top markers contributes more than
# one gene.
each_cluster_features <- planarian_markers %>%
  group_by(cluster) %>%
  slice_max(order_by = avg_log2FC, n = 1) %>%
  pull(gene)

each_cluster_features

In [None]:
# Use a small notebook plot area for the cluster overview
options(repr.plot.width = 4, repr.plot.height = 3)

DimPlot(
  planarian,
  reduction = "umap"
)

In [None]:
# Use a large notebook plot area, then plot each selected gene
options(repr.plot.width = 15, repr.plot.height = 10)

FeaturePlot(
  planarian,
  features = each_cluster_features
)

# 発展



以下のコマンドをターミナルで打つと、ヒトとプラナリアのオーソログ関係の表（一部）がダウンロードできる

```
$ cd data
$ wget https://raw.githubusercontent.com/bioinfo-tsukuba/20201226-EB62104-bioinformatics-course/master/human_ortholog_subset.tsv
```

各クラスターに特徴的な遺伝子から、各クラスターの細胞の機能を類推せよ

dplyr の join 系の関数（例: `inner_join`）を使うとよい

## B21: ヒトとプラナリアのオーソログ関係の表を読み込む

In [None]:
# Read the ortholog table; the first line of the file is used as the
# column names
human_ortholog <- read_tsv(
  file = "data/human_ortholog_subset.tsv",
  col_names = TRUE
)

In [None]:
# Check the structure of the ortholog table
str(object = human_ortholog)

In [None]:
# Add Smed_Code2: a copy of Smed_Code with every "_" replaced by "-".
# It is the key used later to join against the marker table's gene column.
human_ortholog <- mutate(
  human_ortholog,
  Smed_Code2 = gsub("_", "-", Smed_Code)
)

## B22: 各クラスターに特徴的な遺伝子群のヒトオーソログを抽出する

In [None]:
# Show the first rows of the marker table
head(planarian_markers)

In [None]:
# Take the markers of cluster 0 and keep those with a match in the ortholog
# table, joining the gene column to Smed_Code2
inner_join(
  filter(planarian_markers, cluster == 0),
  human_ortholog,
  by = c("gene" = "Smed_Code2")
)