diff --git a/.Rprofile b/.Rprofile
new file mode 100644
index 0000000..f3d72d2
--- /dev/null
+++ b/.Rprofile
@@ -0,0 +1,5 @@
+if (file.exists("renv/activate.R")) {
+  source("renv/activate.R")
+} else {
+  message("renv/activate.R not found; run renv::activate() to regenerate it.")
+}
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b5a0272
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,58 @@
+# renv files. Use renv/* rather than renv/: git cannot re-include a file
+# whose parent directory is itself excluded, so the ! rules need this form.
+renv/*
+!renv/settings.dcf
+!renv/activate.R
+
+# R files
+.Rproj.user/
+.Rhistory
+.RData
+.Ruserdata
+*.Rproj
+
+# Output files
+outputs/*.png
+outputs/*.pdf
+outputs/*.csv
+outputs/*.rds
+outputs/*.html
+
+# Data files (uncomment as needed)
+# data/*.csv
+# data/*.tsv
+# data/*.h5
+# data/*.gz
+# data/*.rds
+
+# Temporary files
+*.tmp
+*.temp
+*~
+
+# System files
+.DS_Store
+Thumbs.db
+
+# IDE files
+.vscode/
+.idea/
+
+# Log files
+*.log
+
+# Compiled R files
+*.so
+*.dll
+*.dylib
+
+# Package build files
+*.tar.gz
+*.zip
+
+# HTML files (unless specifically needed)
+*.html
+
+# Cache directories
+cache/
+.cache/
\ No newline at end of file
diff --git a/R/analysis_functions.R b/R/analysis_functions.R
new file mode 100644
index 0000000..2e19cb9
--- /dev/null
+++ b/R/analysis_functions.R
@@ -0,0 +1,154 @@
+# Basic single-cell RNA-seq analysis functions
+# Author: CC1001-CTRL
+# Date: 2024
+
+#' Load and preprocess single-cell data
+#'
+#' Reads a 10X Genomics matrix directory into a Seurat object, stores the
+#' percentage of counts from "MT-" genes as `percent.mt`, and drops cells
+#' outside the QC thresholds. If `data_path` holds no 10X matrix, the
+#' `pbmc_small` example object is returned instead, with `percent.mt` added
+#' but unfiltered, because the thresholds are sized for real datasets.
+#'
+#' @param data_path Path to a directory containing 10X output
+#'   (`matrix.mtx`, optionally gzipped, plus barcodes and features files).
+#' @param project_name Name for the Seurat object
+#' @param min_features,max_features Cells are kept when `nFeature_RNA` lies
+#'   strictly between these two values.
+#' @param max_percent_mt Cells are kept when `percent.mt` is strictly below
+#'   this value.
+#' @return A preprocessed Seurat object
+#' @export
+load_and_preprocess <- function(data_path,
+                                project_name = "scRNA_analysis",
+                                min_features = 200,
+                                max_features = 5000,
+                                max_percent_mt = 20) {
+  library(Seurat)
+  library(dplyr)
+
+  stopifnot(is.character(data_path), length(data_path) == 1L)
+
+  # file.exists() alone is not a sufficient test: the repository ships a
+  # data/ directory containing only a README, which Read10X() cannot read.
+  matrix_files <- list.files(data_path, pattern = "^matrix\\.mtx(\\.gz)?$")
+  use_example <- length(matrix_files) == 0
+
+  if (use_example) {
+    message("No 10X matrix found in '", data_path, "'")
+    message("Using example pbmc_small dataset")
+    data("pbmc_small")
+    seurat_obj <- pbmc_small
+  } else {
+    counts <- Read10X(data_path)
+    seurat_obj <- CreateSeuratObject(counts = counts, project = project_name)
+  }
+
+  seurat_obj[["percent.mt"]] <- PercentageFeatureSet(seurat_obj,
+                                                     pattern = "^MT-")
+
+  if (use_example) {
+    return(seurat_obj)
+  }
+
+  # which() also discards cells whose comparison evaluates to NA.
+  n_features <- seurat_obj$nFeature_RNA
+  keep <- which(
+    n_features > min_features &
+      n_features < max_features &
+      seurat_obj$percent.mt < max_percent_mt
+  )
+  if (length(keep) == 0) {
+    stop("No cells pass the QC thresholds; relax min_features, ",
+         "max_features or max_percent_mt.", call. = FALSE)
+  }
+
+  subset(seurat_obj, cells = colnames(seurat_obj)[keep])
+}
+
+#' Perform standard normalization and scaling
+#'
+#' Log-normalizes the counts, selects variable features with the "vst"
+#' method, and scales every gene in the object.
+#'
+#' @param seurat_obj A Seurat object
+#' @param nfeatures Number of variable features to find
+#' @return A normalized and scaled Seurat object
+#' @export
+normalize_and_scale <- function(seurat_obj, nfeatures = 2000) {
+  library(Seurat)
+
+  seurat_obj <- NormalizeData(seurat_obj,
+                              normalization.method = "LogNormalize",
+                              scale.factor = 10000)
+  seurat_obj <- FindVariableFeatures(seurat_obj,
+                                     selection.method = "vst",
+                                     nfeatures = nfeatures)
+
+  # Scale all genes rather than only the variable features.
+  ScaleData(seurat_obj, features = rownames(seurat_obj))
+}
+
+#' Perform dimensional reduction
+#'
+#' Runs PCA on the variable features, then builds the neighbor graph,
+#' clusters the cells and computes a UMAP on the leading components.
+#'
+#' @param seurat_obj A Seurat object with variable features and scaled data
+#' @param npcs Number of principal components to compute
+#' @param n_dims Number of leading components used for neighbors and UMAP;
+#'   capped at `npcs`.
+#' @param resolution Clustering resolution passed to `FindClusters()`
+#' @return A Seurat object with PCA and UMAP
+#' @export
+run_dimensional_reduction <- function(seurat_obj, npcs = 50, n_dims = 20,
+                                      resolution = 0.5) {
+  library(Seurat)
+
+  features <- VariableFeatures(object = seurat_obj)
+  if (length(features) == 0) {
+    stop("No variable features found; run FindVariableFeatures() first.",
+         call. = FALSE)
+  }
+
+  # Keep npcs below both matrix dimensions so the truncated PCA can run on
+  # small objects.
+  max_pcs <- min(ncol(seurat_obj), length(features)) - 1L
+  if (npcs > max_pcs) {
+    message("Reducing npcs from ", npcs, " to ", max_pcs)
+    npcs <- max_pcs
+  }
+
+  seurat_obj <- RunPCA(seurat_obj, features = features, npcs = npcs)
+
+  # Was hard-coded to 1:20, which fails when fewer than 20 components were
+  # computed (for example npcs = 10).
+  dims <- seq_len(min(n_dims, npcs))
+
+  seurat_obj <- FindNeighbors(seurat_obj, dims = dims)
+  seurat_obj <- FindClusters(seurat_obj, resolution = resolution)
+  RunUMAP(seurat_obj, dims = dims)
+}
+
+#' Create quality control plots
+#'
+#' @param seurat_obj A Seurat object with a `percent.mt` metadata column
+#' @return A list of ggplot objects named `violin`, `feature_scatter1` and
+#'   `feature_scatter2`
+#' @export
+create_qc_plots <- function(seurat_obj) {
+  library(ggplot2)
+  library(Seurat)
+
+  list(
+    violin = VlnPlot(seurat_obj,
+                     features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
+                     ncol = 3),
+    feature_scatter1 = FeatureScatter(seurat_obj,
+                                      feature1 = "nCount_RNA",
+                                      feature2 = "percent.mt"),
+    feature_scatter2 = FeatureScatter(seurat_obj,
+                                      feature1 = "nCount_RNA",
+                                      feature2 = "nFeature_RNA")
+  )
+}
\ No newline at end of file
diff --git a/R/example_analysis.R b/R/example_analysis.R
new file mode 100644
index 0000000..eb8def6
--- /dev/null
+++ b/R/example_analysis.R
@@ -0,0 +1,108 @@
+# Example single-cell RNA-seq analysis workflow
+# This script demonstrates a complete analysis pipeline
+
+# Load required libraries
+library(Seurat)
+library(ggplot2)
+library(dplyr)
+library(patchwork)
+
+# Source custom functions
+source("R/analysis_functions.R")
+
+# Set seed for reproducibility
+set.seed(42)
+
+# ggsave() and write.csv() do not create a missing output directory
+output_dir <- "outputs"
+dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)
+
+# 1. Load and preprocess data
+message("Loading and preprocessing data...")
+# Uses the example data when data/ holds no 10X matrix
+seurat_obj <- load_and_preprocess("data/")
+
+# 2. Normalize and scale data
+message("Normalizing and scaling data...")
+seurat_obj <- normalize_and_scale(seurat_obj, nfeatures = 2000)
+
+# 3. Run dimensional reduction
+message("Running dimensional reduction...")
+seurat_obj <- run_dimensional_reduction(seurat_obj, npcs = 50)
+
+# 4. Create visualizations
+message("Creating visualizations...")
+
+# Quality control plots
+qc_plots <- create_qc_plots(seurat_obj)
+
+# Save QC plots
+ggsave(file.path(output_dir, "qc_violin_plot.png"), qc_plots$violin,
+       width = 12, height = 6)
+ggsave(file.path(output_dir, "qc_scatter_plot1.png"),
+       qc_plots$feature_scatter1, width = 8, height = 6)
+ggsave(file.path(output_dir, "qc_scatter_plot2.png"),
+       qc_plots$feature_scatter2, width = 8, height = 6)
+
+# PCA plot
+pca_plot <- DimPlot(seurat_obj, reduction = "pca", group.by = "seurat_clusters")
+ggsave(file.path(output_dir, "pca_plot.png"), pca_plot, width = 8, height = 6)
+
+# UMAP plot
+umap_plot <- DimPlot(seurat_obj, reduction = "umap", group.by = "seurat_clusters",
+                     label = TRUE, pt.size = 0.5) +
+  ggtitle("UMAP Clustering")
+ggsave(file.path(output_dir, "umap_plot.png"), umap_plot, width = 8, height = 6)
+
+# Variable features plot
+var_features_plot <- VariableFeaturePlot(seurat_obj)
+top10_features <- head(VariableFeatures(seurat_obj), 10)
+var_features_plot <- LabelPoints(plot = var_features_plot,
+                                 points = top10_features, repel = TRUE)
+ggsave(file.path(output_dir, "variable_features_plot.png"), var_features_plot,
+       width = 10, height = 6)
+
+# 5. Find marker genes for clusters
+message("Finding marker genes...")
+all_markers <- FindAllMarkers(seurat_obj, only.pos = TRUE, min.pct = 0.25,
+                              logfc.threshold = 0.25)
+
+# Save marker genes
+write.csv(all_markers, file.path(output_dir, "cluster_markers.csv"),
+          row.names = FALSE)
+
+# Plot top markers. FindAllMarkers() may return an empty data frame, so
+# group only when there are rows to group.
+if (nrow(all_markers) > 0) {
+  top_markers <- all_markers %>%
+    group_by(cluster) %>%
+    slice_max(n = 2, order_by = avg_log2FC) %>%
+    ungroup()
+
+  # A gene can top several clusters; plot each gene once
+  marker_plot <- FeaturePlot(seurat_obj,
+                             features = head(unique(top_markers$gene), 6),
+                             ncol = 3)
+  ggsave(file.path(output_dir, "top_markers_plot.png"), marker_plot,
+         width = 15, height = 10)
+}
+
+# 6. Create summary statistics
+message("Creating summary statistics...")
+summary_stats <- data.frame(
+  n_cells = ncol(seurat_obj),
+  n_genes = nrow(seurat_obj),
+  n_clusters = length(unique(Idents(seurat_obj))),
+  median_genes_per_cell = median(seurat_obj$nFeature_RNA),
+  median_UMI_per_cell = median(seurat_obj$nCount_RNA),
+  median_mito_percent = median(seurat_obj$percent.mt)
+)
+
+write.csv(summary_stats, file.path(output_dir, "analysis_summary.csv"),
+          row.names = FALSE)
+
+# Save the final Seurat object
+saveRDS(seurat_obj, file.path(output_dir, "processed_seurat_object.rds"))
+
+message("Analysis complete! Check the outputs/ directory for results.")
+print(summary_stats)
\ No newline at end of file
diff --git a/README.md b/README.md
index 8c76ce0..edba99b 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,133 @@
 # scRNA-R-Python-bridging
-scRNA-R-Python-bridging description
+
+A minimal R single-cell template for single-cell RNA sequencing analysis with Seurat, SpatialExperiment, and Bioconductor packages.
+
+## Quick Start
+
+1. **Clone this repository:**
+   ```bash
+   git clone https://github.com/cc1001-ctrl/scRNA-R-Python-bridging.git
+   cd scRNA-R-Python-bridging
+   ```
+
+2. **Install R dependencies using renv:**
+   ```r
+   # Install renv if not already installed
+   if (!requireNamespace("renv", quietly = TRUE)) {
+     install.packages("renv")
+   }
+
+   # Restore the package environment
+   renv::restore()
+   ```
+
+3. 
**Load required libraries:** + ```r + library(Seurat) + library(SpatialExperiment) + library(ggplot2) + library(dplyr) + ``` + +## Project Structure + +``` +├── R/ # R scripts and functions +├── data/ # Input data files +├── outputs/ # Analysis outputs and results +├── vignettes/ # Analysis notebooks and tutorials +├── renv.lock # R package dependencies +└── README.md # This file +``` + +## Dependencies + +This template includes the following key packages: + +- **Seurat** (v4.3.0): Comprehensive single-cell RNA-seq analysis +- **SpatialExperiment** (v1.10.0): Spatial transcriptomics data handling +- **ggplot2** (v3.4.2): Data visualization +- **Bioconductor** packages: Core infrastructure for genomics data +- **dplyr**: Data manipulation +- **Matrix**: Sparse matrix operations + +## renv Package Management + +This project uses `renv` for reproducible package management. The `renv.lock` file contains all package versions and dependencies. + +### Initial Setup +```r +# Install renv +install.packages("renv") + +# Initialize renv in your project (if not already done) +renv::init() + +# Restore packages from lockfile +renv::restore() +``` + +### Managing Dependencies +```r +# Install new packages +install.packages("package_name") +renv::snapshot() # Update lockfile + +# Update all packages +renv::update() + +# Check package status +renv::status() +``` + +## Example Usage + +```r +# Load libraries +library(Seurat) +library(ggplot2) + +# Create a simple Seurat object example +# (Replace with your actual data loading) +data("pbmc_small") +pbmc <- pbmc_small + +# Basic analysis workflow +pbmc <- NormalizeData(pbmc) +pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000) +pbmc <- ScaleData(pbmc) +pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc)) + +# Visualization +DimPlot(pbmc, reduction = "pca") +``` + +## Getting Help + +- [Seurat Documentation](https://satijalab.org/seurat/) +- [SpatialExperiment 
Documentation](https://bioconductor.org/packages/SpatialExperiment/) +- [Bioconductor Documentation](https://bioconductor.org/) + +## Citation + +If you use this template in your research, please cite: + +```bibtex +@software{scRNA_R_Python_bridging, + title = {scRNA-R-Python-bridging: Minimal R Single-Cell Template}, + author = {{CC1001-CTRL}}, + year = {2024}, + url = {https://github.com/cc1001-ctrl/scRNA-R-Python-bridging}, + note = {R template for single-cell RNA sequencing analysis} +} +``` + +### Package Citations + +- **Seurat**: Hao et al. "Integrated analysis of multimodal single-cell data." Cell (2021) +- **SpatialExperiment**: Righelli et al. "SpatialExperiment: infrastructure for spatially-resolved transcriptomics data in R using Bioconductor." Bioinformatics (2022) +- **ggplot2**: Wickham, H. "ggplot2: Elegant Graphics for Data Analysis." Springer-Verlag New York (2016) + +## License + +This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000..f606d0d --- /dev/null +++ b/data/README.md @@ -0,0 +1,36 @@ +# Data Directory + +This directory is intended for input data files for single-cell RNA sequencing analysis. + +## Recommended Data Formats + +### 10X Genomics Output +- `barcodes.tsv.gz` - Cell barcodes +- `features.tsv.gz` - Gene information +- `matrix.mtx.gz` - Count matrix + +### Other Formats +- `.h5` files (HDF5 format) +- `.csv` or `.tsv` files with count matrices +- `.rds` files with pre-processed R objects + +## Example Data Structure + +``` +data/ +├── experiment1/ +│ ├── barcodes.tsv.gz +│ ├── features.tsv.gz +│ └── matrix.mtx.gz +├── experiment2/ +│ └── data.h5 +└── metadata/ + ├── sample_info.csv + └── experimental_design.csv +``` + +## Usage + +Place your data files in this directory and modify the analysis scripts in the `R/` directory to point to your specific data files. 
+ +**Note:** Large data files should be stored externally (e.g., on a server or cloud storage) and downloaded as needed. Consider adding large data files to `.gitignore` to avoid committing them to version control. \ No newline at end of file diff --git a/outputs/README.md b/outputs/README.md new file mode 100644 index 0000000..2f88b29 --- /dev/null +++ b/outputs/README.md @@ -0,0 +1,27 @@ +# Outputs Directory + +This directory contains analysis results and outputs generated by the R scripts. + +## Generated Files + +When you run the analysis scripts, the following types of files will be created here: + +### Plots and Visualizations +- `*.png` - Plot images (quality control, PCA, UMAP, etc.) +- `*.pdf` - High-resolution plots for publication + +### Data Tables +- `*.csv` - Analysis results (marker genes, summary statistics, etc.) +- `cluster_markers.csv` - Differentially expressed genes by cluster +- `analysis_summary.csv` - Summary statistics of the analysis + +### R Objects +- `*.rds` - Saved R objects (processed Seurat objects, etc.) +- `processed_seurat_object.rds` - Final processed single-cell data + +### Reports +- `*.html` - Rendered R Markdown reports + +## Note + +Files in this directory are typically generated by the analysis and should not be committed to version control (they are included in `.gitignore`). To reproduce the outputs, run the analysis scripts in the `R/` directory. 
\ No newline at end of file diff --git a/renv.lock b/renv.lock new file mode 100644 index 0000000..4c4a48a --- /dev/null +++ b/renv.lock @@ -0,0 +1,162 @@ +{ + "R": { + "Version": "4.3.0", + "Repositories": [ + { + "Name": "CRAN", + "URL": "https://cran.rstudio.com" + }, + { + "Name": "BioConductor", + "URL": "https://bioconductor.org/packages/3.17" + } + ] + }, + "Packages": { + "BiocGenerics": { + "Package": "BiocGenerics", + "Version": "0.46.0", + "Source": "Bioconductor", + "Repository": "BioConductor", + "RemoteType": "standard", + "RemotePkgRef": "BiocGenerics", + "RemoteRef": "BiocGenerics", + "RemoteRepos": "https://bioconductor.org/packages/3.17", + "RemotePkgPlatform": "source", + "Hash": "a0036df73063564267e2b8ff0c52b15d" + }, + "BiocManager": { + "Package": "BiocManager", + "Version": "1.30.20", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "b5c4bfd7b041c77f010b8a6f20a624c8" + }, + "S4Vectors": { + "Package": "S4Vectors", + "Version": "0.38.1", + "Source": "Bioconductor", + "Repository": "BioConductor", + "Hash": "6b1e8cda1b22b7be1a9f0b9dd8e95a19" + }, + "SingleCellExperiment": { + "Package": "SingleCellExperiment", + "Version": "1.22.0", + "Source": "Bioconductor", + "Repository": "BioConductor", + "Hash": "97fd86a1e7df2ce9b4f5b45a38b30a16" + }, + "SpatialExperiment": { + "Package": "SpatialExperiment", + "Version": "1.10.0", + "Source": "Bioconductor", + "Repository": "BioConductor", + "Hash": "bc9cc1e5a6ab2dc4c73b0e62c55b9b77" + }, + "SummarizedExperiment": { + "Package": "SummarizedExperiment", + "Version": "1.30.2", + "Source": "Bioconductor", + "Repository": "BioConductor", + "Hash": "b8c4a95f86e8e9e995e5b9f8c2e78dd5" + }, + "Seurat": { + "Package": "Seurat", + "Version": "4.3.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d3c9b3b3cd71b5e0eed1db7dd8de8e7a" + }, + "SeuratObject": { + "Package": "SeuratObject", + "Version": "4.1.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": 
"a84c5bac45b8e2edd2b87b6f7f47e1ad" + }, + "Matrix": { + "Package": "Matrix", + "Version": "1.5-4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "e779c7d9d35297a634fd9959c8a42a6b" + }, + "ggplot2": { + "Package": "ggplot2", + "Version": "3.4.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "ad0d41a45d65a466af8b42c18fbbb8dc" + }, + "dplyr": { + "Package": "dplyr", + "Version": "1.1.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "dea6970ff715ca541c387de363ff405e" + }, + "tibble": { + "Package": "tibble", + "Version": "3.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "d6e4e1ad8b86a9e1a96f8b6c16b4f5dc" + }, + "tidyr": { + "Package": "tidyr", + "Version": "1.3.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "e47debdc7ce599b070c8e78e8ac0cfcf" + }, + "scales": { + "Package": "scales", + "Version": "1.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "906cb23d2f1c5680b8ce439b44c6fa63" + }, + "gtable": { + "Package": "gtable", + "Version": "0.3.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "b44addadb528a0d227794121c00572a0" + }, + "RColorBrewer": { + "Package": "RColorBrewer", + "Version": "1.1-3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "45f0398006e83a5b10b72a90663d8d8c" + }, + "viridisLite": { + "Package": "viridisLite", + "Version": "0.4.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "c826c7c4241b6fc89ff55aaea3fa7491" + }, + "cowplot": { + "Package": "cowplot", + "Version": "1.1.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "b418e8423699d11c7f2087c2bfd07da2" + }, + "patchwork": { + "Package": "patchwork", + "Version": "1.1.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "e83ed8453c1b538bd9ba5a67a8c51cd5" + }, + "renv": { + "Package": "renv", + "Version": "0.17.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "4543b8cd233ae25c6aba8548be9e747e" + } + } +} \ No newline at end 
of file
diff --git a/vignettes/getting_started.Rmd b/vignettes/getting_started.Rmd
new file mode 100644
index 0000000..5eb20fb
--- /dev/null
+++ b/vignettes/getting_started.Rmd
@@ -0,0 +1,184 @@
+---
+title: "Getting Started with scRNA-R-Python-bridging"
+author: "CC1001-CTRL"
+date: "`r Sys.Date()`"
+output:
+  html_document:
+    toc: true
+    toc_float: true
+    theme: united
+    highlight: tango
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE,
+                      fig.width = 8, fig.height = 6)
+```
+
+# Introduction
+
+This vignette demonstrates how to use the scRNA-R-Python-bridging template for single-cell RNA sequencing analysis. The template includes essential packages like Seurat, SpatialExperiment, and ggplot2, along with a reproducible environment managed by renv.
+
+## Package Installation
+
+If you haven't already set up the environment, install the required packages:
+
+```{r eval=FALSE}
+# Install renv if not already installed
+if (!requireNamespace("renv", quietly = TRUE)) {
+  install.packages("renv")
+}
+
+# Restore the package environment
+renv::restore()
+```
+
+## Load Required Libraries
+
+```{r load_libraries}
+library(Seurat)
+library(ggplot2)
+library(dplyr)
+library(patchwork)
+
+# Source our custom functions
+source("../R/analysis_functions.R")
+```
+
+# Basic Single-Cell Analysis Workflow
+
+## 1. Data Loading and Preprocessing
+
+For this example, we'll use the built-in pbmc_small dataset that comes with Seurat:
+
+```{r load_data}
+# Load example data
+data("pbmc_small")
+pbmc <- pbmc_small
+
+# Display basic information
+pbmc
+```
+
+## 2. Quality Control
+
+Let's examine the quality control metrics:
+
+```{r qc_metrics}
+# Add mitochondrial gene percentage
+pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
+
+# Create QC plots
+qc_plots <- create_qc_plots(pbmc)
+qc_plots$violin
+```
+
+```{r qc_scatter}
+# Feature scatter plots
+qc_plots$feature_scatter1
+qc_plots$feature_scatter2
+```
+
+## 3. Normalization and Feature Selection
+
+```{r normalization}
+# Normalize and scale data
+pbmc <- normalize_and_scale(pbmc, nfeatures = 2000)
+
+# Show top variable features
+top10 <- head(VariableFeatures(pbmc), 10)
+print(top10)
+
+# Plot variable features
+var_plot <- VariableFeaturePlot(pbmc)
+var_plot <- LabelPoints(plot = var_plot, points = top10, repel = TRUE)
+var_plot
+```
+
+## 4. Dimensional Reduction
+
+```{r dimensionality_reduction}
+# Run PCA, clustering, and UMAP
+pbmc <- run_dimensional_reduction(pbmc, npcs = 50)
+
+# PCA plot
+DimPlot(pbmc, reduction = "pca")
+```
+
+```{r umap}
+# UMAP plot
+DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) +
+  ggtitle("UMAP of PBMC Clusters")
+```
+
+## 5. Cluster Analysis
+
+```{r cluster_analysis}
+# Number of cells per cluster
+table(Idents(pbmc))
+
+# Find marker genes for each cluster
+markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
+
+# Display top markers for each cluster. FindAllMarkers() may return an
+# empty data frame when nothing passes the thresholds, so group only when
+# there are rows to group.
+top_markers <- markers
+if (nrow(markers) > 0) {
+  top_markers <- markers %>%
+    group_by(cluster) %>%
+    slice_max(n = 3, order_by = avg_log2FC) %>%
+    ungroup()
+}
+
+print(top_markers)
+```
+
+## 6. Visualization of Marker Genes
+
+```{r marker_visualization}
+# Plot expression of top marker genes
+if (nrow(top_markers) > 0) {
+  feature_genes <- head(unique(top_markers$gene), 6)
+  FeaturePlot(pbmc, features = feature_genes, ncol = 3)
+}
+```
+
+## 7. Spatial Analysis with SpatialExperiment
+
+For spatial transcriptomics data, you can use SpatialExperiment:
+
+```{r spatial_example, eval=FALSE}
+library(SpatialExperiment)
+
+# Example of creating a SpatialExperiment object
+# (This is just a template - replace with your actual spatial data)
+# spe <- SpatialExperiment(
+#   assays = list(counts = your_count_matrix),
+#   colData = your_cell_metadata,
+#   spatialCoords = your_spatial_coordinates
+# )
+```
+
+# Session Information
+
+```{r session_info}
+sessionInfo()
+```
+
+# Summary
+
+This vignette demonstrated:
+
+1. Loading and quality control of single-cell data
+2. Normalization and feature selection
+3. Dimensional reduction with PCA and UMAP
+4. Clustering and marker gene identification
+5. Visualization techniques
+
+The template provides a solid foundation for single-cell RNA-seq analysis with reproducible package management through renv.
+
+For more advanced analyses, refer to:
+- [Seurat documentation](https://satijalab.org/seurat/)
+- [SpatialExperiment vignettes](https://bioconductor.org/packages/SpatialExperiment/)
+- [Bioconductor workflows](https://bioconductor.org/help/workflows/)
\ No newline at end of file